diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0ed62172..e8481a97 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -88,6 +88,7 @@ copyright-date: -D GPU_TARGETS="$GPU_TARGETS" -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx + -D CMAKE_CXX_STANDARD=14 -B $CI_PROJECT_DIR/rocPRIM/build -S $CI_PROJECT_DIR/rocPRIM - cd $CI_PROJECT_DIR/rocPRIM/build @@ -116,6 +117,7 @@ build:rocm: -D ROCM_SYMLINK_LIBS=OFF -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx + -D CMAKE_CXX_STANDARD="$BUILD_VERSION" -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build @@ -133,6 +135,9 @@ build:rocm: - $CI_PROJECT_DIR/build/hipcub*.zip - $CI_PROJECT_DIR/build/.ninja_log expire_in: 2 weeks + parallel: + matrix: + - BUILD_VERSION: [14, 17] build:rocm-benchmark: extends: @@ -153,6 +158,7 @@ build:rocm-benchmark: -D GPU_TARGETS="$GPU_TARGETS" -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx + -D CMAKE_CXX_STANDARD=14 -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build @@ -176,6 +182,7 @@ test:rocm: - cd $CI_PROJECT_DIR/build - cmake -D CMAKE_PREFIX_PATH=/opt/rocm + -D CMAKE_CXX_STANDARD=14 -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. @@ -231,6 +238,7 @@ benchmark:rocm: -G Ninja -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror" "$GPU_TARGETS_ARG" + -D CMAKE_CXX_STANDARD=14 -S $CI_PROJECT_DIR/test/extra -B $CI_PROJECT_DIR/build/package_test - cmake --build $CI_PROJECT_DIR/build/package_test @@ -251,6 +259,7 @@ benchmark:rocm: - cmake -G Ninja -D BUILD_TEST=OFF + -D CMAKE_CXX_STANDARD=14 -S $CI_PROJECT_DIR -B $CI_PROJECT_DIR/build_only_install # Preserve $PATH when sudoing @@ -318,6 +327,7 @@ build:nvcc: -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda + -D CMAKE_CXX_STANDARD="$BUILD_VERSION" -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build @@ -335,6 +345,9 @@ build:nvcc: - $CI_PROJECT_DIR/build/hipcub*.zip - $CI_PROJECT_DIR/build/.ninja_log expire_in: 2 weeks + parallel: + matrix: + - BUILD_VERSION: [14, 17] build:nvcc-benchmark: stage: build @@ -354,6 +367,7 @@ build:nvcc-benchmark: -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CUDA_COMPILER_LAUNCHER=phc_sccache_cuda + -D CMAKE_CXX_STANDARD=14 -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build diff --git a/CHANGELOG.md b/CHANGELOG.md index f61d0495..fc021075 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,13 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * Added the `hipcub::CubVector` type for CUB parity. * Added `--emulation` option for `rtest.py` * Unit tests can be run with `[--emulation|-e|--test|-t]=` +* Added `DeviceSelect::FlaggedIf` and its inplace overload. ### Changed * Changed the subset of tests that are run for smoke tests such that the smoke test will complete with faster run-time and to never exceed 2GB of vram usage. Use `python rtest.py [--emulation|-e|--test|-t]=smoke` to run these tests. * The `rtest.py` options have changed. `rtest.py` is now run with at least either `--test|-t` or `--emulation|-e`, but not both options. +* The NVIDIA backend now requires CUB, Thrust and libcu++ 2.5.0. If it is not found it will be downloaded from the NVIDIA CCCL repository. +* Changed the C++ version from 14 to 17. C++14 will be deprecated in the next major release. ## hipCUB-3.3.0 for ROCm 6.3.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index ed8f95a5..424d3efc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -61,10 +61,19 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") # Set CXX flags -set(CMAKE_CXX_STANDARD 14) +if (NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) + +if (CMAKE_CXX_STANDARD EQUAL 14) + message(WARNING "C++14 will be deprecated in the next major release") +elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) + message(FATAL_ERROR "Only C++14 and C++17 are supported") +endif() + # rocm-cmake has to be included early so that it's available to set GPU_TARGETS # If hip is included prior to setting that then it defaults to building only for the current architecture include(ROCmCMakeBuildToolsDependency) diff --git a/README.md b/README.md index 0c345911..6a0ad19b 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ python3 -m http.server * Requires CMake 3.16.9 or later * For NVIDIA GPUs: * CUDA Toolkit - * CCCL library (>= 2.4.0) + * CCCL library (>= 2.5.0) * Automatically downloaded and built by the CMake script * Requires CMake 3.15.0 or later * Python 3.6 or higher (for HIP on Windows only; this is only required for install scripts) diff --git a/benchmark/benchmark_device_adjacent_difference.cpp b/benchmark/benchmark_device_adjacent_difference.cpp index e0788f0b..335144c0 100644 --- a/benchmark/benchmark_device_adjacent_difference.cpp +++ b/benchmark/benchmark_device_adjacent_difference.cpp @@ -178,12 +178,12 @@ void run_benchmark(benchmark::State& state, const std::size_t size, const hipStr state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); - hipFree(d_input); + HIP_CHECK(hipFree(d_input)); if(copy) { - hipFree(d_output); + HIP_CHECK(hipFree(d_output)); } - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_temp_storage)); } } // namespace diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index 267185c7..b7ef64be 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -111,14 +111,14 @@ void run_encode_benchmark(benchmark::State& state, for(size_t i = 0; i < batch_size; i++) { - hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_unique_output, - d_counts_output, - d_runs_count_output, - size, - stream); + HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_unique_output, + d_counts_output, + d_runs_count_output, + size, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); @@ -214,14 +214,14 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, for(size_t i = 0; i < batch_size; i++) { - hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_offsets_output, - d_counts_output, - d_runs_count_output, - size, - stream); + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_offsets_output, + d_counts_output, + d_runs_count_output, + size, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 2cd5e4cf..a14cbddb 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -113,11 +113,11 @@ void run_flagged_benchmark(benchmark::State& state, state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); - hipFree(d_input); - hipFree(d_flags); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_flags)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipDeviceSynchronize()); } @@ -211,7 +211,104 @@ void run_selectop_benchmark(benchmark::State& state, state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipDeviceSynchronize()); +} + +template +void run_flagged_if_benchmark(benchmark::State& state, + size_t size, + const hipStream_t stream, + float true_probability) +{ + std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); + + std::vector flags + = benchmark_utils::get_random_data01(size, true_probability); + + SelectOperator select_flag_op(true_probability); + + T* d_input; + FlagType* d_flags; + T* d_output; + unsigned int* d_selected_count_output; + HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + // Allocate temporary storage memory + size_t temp_storage_size_bytes = 0; + + // Get size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + select_flag_op, + stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // allocate temporary storage + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); + HIP_CHECK(hipDeviceSynchronize()); + + // Warm-up + for(size_t i = 0; i < 10; i++) + { + HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + select_flag_op, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + const unsigned int batch_size = 10; + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < batch_size; i++) + { + HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + select_flag_op, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); + state.SetItemsProcessed(state.iterations() * batch_size * size); + hipFree(d_input); + hipFree(d_flags); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); @@ -301,10 +398,10 @@ void run_unique_benchmark(benchmark::State& state, state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); - hipFree(d_input); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); } template @@ -414,12 +511,12 @@ void run_unique_by_key_benchmark(benchmark::State& state, * (sizeof(KeyT) + sizeof(ValueT))); state.SetItemsProcessed(state.iterations() * batch_size * size); - hipFree(d_keys_input); - hipFree(d_values_input); - hipFree(d_keys_output); - hipFree(d_values_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_values_input)); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ @@ -442,6 +539,17 @@ void run_unique_by_key_benchmark(benchmark::State& state, stream, \ p) +#define CREATE_SELECT_FLAGGED_IF_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + std::string("device_select_flagged_if.(probability:" #p ")") \ + .c_str(), \ + &run_flagged_if_benchmark, \ + size, \ + stream, \ + p) + #define CREATE_UNIQUE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ std::string("device_select_unique -__global__ __launch_bounds__(64) void warp_reduce_kernel(const T* d_input, T* d_output) +__global__ __launch_bounds__(64) +auto warp_reduce_kernel(const T* d_input, T* d_output) + -> std::enable_if_t> { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -48,10 +50,16 @@ __global__ __launch_bounds__(64) void warp_reduce_kernel(const T* d_input, T* d_ d_output[i] = value; } +template +__global__ __launch_bounds__(64) +auto warp_reduce_kernel(const T* /*d_input*/, T* /*d_output*/) + -> std::enable_if_t> +{} + template -__global__ __launch_bounds__(64) void segmented_warp_reduce_kernel(const T* d_input, - Flag* d_flags, - T* d_output) +__global__ __launch_bounds__(64) +auto segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) + -> std::enable_if_t> { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -69,6 +77,12 @@ __global__ __launch_bounds__(64) void segmented_warp_reduce_kernel(const T* d_in d_output[i] = value; } +template +__global__ __launch_bounds__(64) +auto segmented_warp_reduce_kernel(const T* /*d_input*/, Flag* /*d_flags*/, T* /*d_output*/) + -> std::enable_if_t> +{} + template - __device__ static void run(const T* input, T* output, const T init) + __device__ + static auto run(const T* input, T* output, const T init) + -> std::enable_if_t> { (void)init; @@ -63,12 +65,20 @@ struct inclusive_scan output[i] = value; } + + template + __device__ + static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) + -> std::enable_if_t> + {} }; struct exclusive_scan { template - __device__ static void run(const T* input, T* output, const T init) + __device__ + static auto run(const T* input, T* output, const T init) + -> std::enable_if_t> { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; @@ -84,12 +94,19 @@ struct exclusive_scan output[i] = value; } + template + __device__ + static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) + -> std::enable_if_t> + {} }; struct broadcast { template - __device__ static void run(const T* input, T* output, const T init) + __device__ + static auto run(const T* input, T* output, const T init) + -> std::enable_if_t> { (void)init; @@ -106,6 +123,12 @@ struct broadcast output[i] = value; } + + template + __device__ + static auto run(const T* /*input*/, T* /*output*/, const T /*init*/) + -> std::enable_if_t> + {} }; template, BLOCK_THREADS)); + HIP_CHECK(MaxSmOccupancy(max_sm_occupancy, + BlockSortKernel, + BLOCK_THREADS)); // Copy problem to device - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, hipMemcpyHostToDevice)); printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); @@ -218,8 +220,8 @@ void Test() d_elapsed); // Check for kernel errors and STDIO from the kernel, if any - HipcubDebug(cudaPeekAtLastError()); - HipcubDebug(cudaDeviceSynchronize()); + HIP_CHECK(cudaPeekAtLastError()); + HIP_CHECK(cudaDeviceSynchronize()); // Check results printf("\tOutput items: "); @@ -247,7 +249,8 @@ void Test() elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device - HipcubDebug(hipMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, hipMemcpyDeviceToHost)); + HIP_CHECK( + hipMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, hipMemcpyDeviceToHost)); for (int j = 0; j < g_grid_size; j++) { elapsed_clocks += h_elapsed[j]; @@ -255,7 +258,7 @@ void Test() } // Check for kernel errors and STDIO from the kernel, if any - HipcubDebug(cudaDeviceSynchronize()); + HIP_CHECK(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; @@ -273,9 +276,12 @@ void Test() if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_elapsed) delete[] h_elapsed; - if (d_in) HipcubDebug(cudaFree(d_in)); - if (d_out) HipcubDebug(cudaFree(d_out)); - if (d_elapsed) HipcubDebug(cudaFree(d_elapsed)); + if(d_in) + HIP_CHECK(cudaFree(d_in)); + if(d_out) + HIP_CHECK(cudaFree(d_out)); + if(d_elapsed) + HIP_CHECK(cudaFree(d_elapsed)); } @@ -304,7 +310,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); fflush(stdout); // Run tests diff --git a/examples/block/example_block_reduce.cu b/examples/block/example_block_reduce.cu index 537a2dfd..6d472b73 100644 --- a/examples/block/example_block_reduce.cu +++ b/examples/block/example_block_reduce.cu @@ -167,7 +167,9 @@ void Test() // Kernel props int max_sm_occupancy; - HipcubDebug(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel, BLOCK_THREADS)); + HIP_CHECK(MaxSmOccupancy(max_sm_occupancy, + BlockSumKernel, + BLOCK_THREADS)); // Copy problem to device hipMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, hipMemcpyHostToDevice); @@ -211,14 +213,14 @@ void Test() // Copy clocks from device clock_t clocks; - HipcubDebug(hipMemcpy(&clocks, d_elapsed, sizeof(clock_t), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&clocks, d_elapsed, sizeof(clock_t), hipMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any - HipcubDebug(cudaPeekAtLastError()); - HipcubDebug(cudaDeviceSynchronize()); + HIP_CHECK(cudaPeekAtLastError()); + HIP_CHECK(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; @@ -264,7 +266,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_REDUCE_RAKING>(); diff --git a/examples/block/example_block_scan.cu b/examples/block/example_block_scan.cu index 1e67a822..9de523f6 100644 --- a/examples/block/example_block_scan.cu +++ b/examples/block/example_block_scan.cu @@ -196,7 +196,9 @@ void Test() // Kernel props int max_sm_occupancy; - HipcubDebug(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel, BLOCK_THREADS)); + HIP_CHECK(MaxSmOccupancy(max_sm_occupancy, + BlockPrefixSumKernel, + BLOCK_THREADS)); // Copy problem to device hipMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, hipMemcpyHostToDevice); @@ -246,14 +248,14 @@ void Test() // Copy clocks from device clock_t clocks; - HipcubDebug(hipMemcpy(&clocks, d_elapsed, sizeof(clock_t), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(&clocks, d_elapsed, sizeof(clock_t), hipMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any - HipcubDebug(cudaPeekAtLastError()); - HipcubDebug(cudaDeviceSynchronize()); + HIP_CHECK(cudaPeekAtLastError()); + HIP_CHECK(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; @@ -300,7 +302,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_SCAN_RAKING>(); diff --git a/examples/device/example_device_partition_flagged.cpp b/examples/device/example_device_partition_flagged.cpp index b520e4d7..cd3e72ac 100644 --- a/examples/device/example_device_partition_flagged.cpp +++ b/examples/device/example_device_partition_flagged.cpp @@ -166,7 +166,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; @@ -185,27 +185,40 @@ int main(int argc, char** argv) int *d_in = NULL; unsigned char *d_flags = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); - HipcubDebug(hipMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, hipMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); + HIP_CHECK(hipcub::DevicePartition::Flagged(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); @@ -215,13 +228,20 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (h_reference) delete[] h_reference; - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_num_selected_out) HipcubDebug(g_allocator.DeviceFree(d_num_selected_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_flags) HipcubDebug(g_allocator.DeviceFree(d_flags)); + if(h_in) + delete[] h_in; + if(h_reference) + delete[] h_reference; + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_num_selected_out) + HIP_CHECK(g_allocator.DeviceFree(d_num_selected_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_flags) + HIP_CHECK(g_allocator.DeviceFree(d_flags)); printf("\n\n"); diff --git a/examples/device/example_device_partition_if.cpp b/examples/device/example_device_partition_if.cpp index b0e5b6e0..9568ef5b 100644 --- a/examples/device/example_device_partition_if.cpp +++ b/examples/device/example_device_partition_if.cpp @@ -174,7 +174,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; @@ -199,25 +199,37 @@ int main(int argc, char** argv) // Allocate problem device arrays int *d_in = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items, + select_op)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); + HIP_CHECK(hipcub::DevicePartition::If(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items, + select_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); @@ -227,12 +239,18 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (h_reference) delete[] h_reference; - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_num_selected_out) HipcubDebug(g_allocator.DeviceFree(d_num_selected_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(h_in) + delete[] h_in; + if(h_reference) + delete[] h_reference; + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_num_selected_out) + HIP_CHECK(g_allocator.DeviceFree(d_num_selected_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); diff --git a/examples/device/example_device_radix_sort.cpp b/examples/device/example_device_radix_sort.cpp index 4d49d23e..c74e1476 100644 --- a/examples/device/example_device_radix_sort.cpp +++ b/examples/device/example_device_radix_sort.cpp @@ -62,8 +62,8 @@ hipcub::CachingDeviceAllocator g_allocator; // Caching allocator for device me //--------------------------------------------------------------------- /** - * Simple key-value pairing for floating point types. Distinguishes - * between positive and negative zero. + * Simple key-value pairing for floating point types. + * Treats positive and negative zero as equivalent. */ struct Pair { @@ -72,18 +72,7 @@ struct Pair bool operator<(const Pair &b) const { - if (key < b.key) - return true; - - if (key > b.key) - return false; - - // Return true if key is negative zero and b.key is positive zero - unsigned int key_bits = static_cast(key); - unsigned int b_key_bits = static_cast(b.key); - unsigned int HIGH_BIT = 1u << 31; - - return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0); + return key < b.key; } }; @@ -159,7 +148,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); printf("hipcub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n", num_items, int(sizeof(float)), int(sizeof(int))); @@ -177,24 +166,38 @@ int main(int argc, char** argv) // Allocate device arrays DoubleBuffer d_keys; DoubleBuffer d_values; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items)); // Allocate temporary storage size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; - HipcubDebug(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Initialize device arrays - HipcubDebug(hipMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, hipMemcpyHostToDevice)); - HipcubDebug(hipMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_keys.d_buffers[d_keys.selector], + h_keys, + sizeof(float) * num_items, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values.d_buffers[d_values.selector], + h_values, + sizeof(int) * num_items, + hipMemcpyHostToDevice)); // Run - HipcubDebug(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); + HIP_CHECK(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose); @@ -205,16 +208,25 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_keys) delete[] h_keys; - if (h_reference_keys) delete[] h_reference_keys; - if (h_values) delete[] h_values; - if (h_reference_values) delete[] h_reference_values; - - if (d_keys.d_buffers[0]) HipcubDebug(g_allocator.DeviceFree(d_keys.d_buffers[0])); - if (d_keys.d_buffers[1]) HipcubDebug(g_allocator.DeviceFree(d_keys.d_buffers[1])); - if (d_values.d_buffers[0]) HipcubDebug(g_allocator.DeviceFree(d_values.d_buffers[0])); - if (d_values.d_buffers[1]) HipcubDebug(g_allocator.DeviceFree(d_values.d_buffers[1])); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(h_keys) + delete[] h_keys; + if(h_reference_keys) + delete[] h_reference_keys; + if(h_values) + delete[] h_values; + if(h_reference_values) + delete[] h_reference_values; + + if(d_keys.d_buffers[0]) + HIP_CHECK(g_allocator.DeviceFree(d_keys.d_buffers[0])); + if(d_keys.d_buffers[1]) + HIP_CHECK(g_allocator.DeviceFree(d_keys.d_buffers[1])); + if(d_values.d_buffers[0]) + HIP_CHECK(g_allocator.DeviceFree(d_values.d_buffers[0])); + if(d_values.d_buffers[1]) + HIP_CHECK(g_allocator.DeviceFree(d_values.d_buffers[1])); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); diff --git a/examples/device/example_device_reduce.cpp b/examples/device/example_device_reduce.cpp index e34bab21..4695461a 100644 --- a/examples/device/example_device_reduce.cpp +++ b/examples/device/example_device_reduce.cpp @@ -124,7 +124,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); printf("hipcub::DeviceReduce::Sum() %d items (%d-byte elements)\n", num_items, (int) sizeof(int)); @@ -140,23 +140,25 @@ int main(int argc, char** argv) // Allocate problem device arrays int *d_in = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); // Allocate device output array int *d_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1)); // Request and allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK( + hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + HIP_CHECK( + hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose); @@ -164,10 +166,14 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(h_in) + delete[] h_in; + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); diff --git a/examples/device/example_device_scan.cpp b/examples/device/example_device_scan.cpp index 48a1a694..0d229723 100644 --- a/examples/device/example_device_scan.cpp +++ b/examples/device/example_device_scan.cpp @@ -130,7 +130,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); printf("hipcub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n", num_items, (int) sizeof(int)); @@ -146,23 +146,31 @@ int main(int argc, char** argv) // Allocate problem device arrays int *d_in = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); // Allocate device output array int *d_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + HIP_CHECK(hipcub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); @@ -170,11 +178,16 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (h_reference) delete[] h_reference; - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(h_in) + delete[] h_in; + if(h_reference) + delete[] h_reference; + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); diff --git a/examples/device/example_device_select_flagged.cpp b/examples/device/example_device_select_flagged.cpp index e3ea78d0..d63d53d5 100644 --- a/examples/device/example_device_select_flagged.cpp +++ b/examples/device/example_device_select_flagged.cpp @@ -167,7 +167,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; @@ -186,27 +186,40 @@ int main(int argc, char** argv) int *d_in = NULL; unsigned char *d_flags = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); - HipcubDebug(hipMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, hipMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); + HIP_CHECK(hipcub::DeviceSelect::Flagged(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); @@ -216,13 +229,20 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (h_reference) delete[] h_reference; - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_num_selected_out) HipcubDebug(g_allocator.DeviceFree(d_num_selected_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_flags) HipcubDebug(g_allocator.DeviceFree(d_flags)); + if(h_in) + delete[] h_in; + if(h_reference) + delete[] h_reference; + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_num_selected_out) + HIP_CHECK(g_allocator.DeviceFree(d_num_selected_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_flags) + HIP_CHECK(g_allocator.DeviceFree(d_flags)); printf("\n\n"); diff --git a/examples/device/example_device_select_if.cpp b/examples/device/example_device_select_if.cpp index 1447a069..dcc7f417 100644 --- a/examples/device/example_device_select_if.cpp +++ b/examples/device/example_device_select_if.cpp @@ -174,7 +174,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Allocate host arrays int *h_in = new int[num_items]; @@ -199,25 +199,37 @@ int main(int argc, char** argv) // Allocate problem device arrays int *d_in = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items, + select_op)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items, + select_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); @@ -227,12 +239,18 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (h_reference) delete[] h_reference; - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_num_selected_out) HipcubDebug(g_allocator.DeviceFree(d_num_selected_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(h_in) + delete[] h_in; + if(h_reference) + delete[] h_reference; + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_num_selected_out) + HIP_CHECK(g_allocator.DeviceFree(d_num_selected_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); diff --git a/examples/device/example_device_select_unique.cpp b/examples/device/example_device_select_unique.cpp index 5fe6ec52..2be49d86 100644 --- a/examples/device/example_device_select_unique.cpp +++ b/examples/device/example_device_select_unique.cpp @@ -161,7 +161,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; @@ -177,25 +177,35 @@ int main(int argc, char** argv) // Allocate problem device arrays int *d_in = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); // Initialize device input - HipcubDebug(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_in, h_in, sizeof(int) * num_items, hipMemcpyHostToDevice)); // Allocate device output array and num selected int *d_out = NULL; int *d_num_selected_out = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); // Allocate temporary storage void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - HipcubDebug(hipcub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + size_t temp_storage_bytes = 0; + HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run - HipcubDebug(hipcub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); + HIP_CHECK(hipcub::DeviceSelect::Unique(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); @@ -205,12 +215,18 @@ int main(int argc, char** argv) AssertEquals(0, compare); // Cleanup - if (h_in) delete[] h_in; - if (h_reference) delete[] h_reference; - if (d_in) HipcubDebug(g_allocator.DeviceFree(d_in)); - if (d_out) HipcubDebug(g_allocator.DeviceFree(d_out)); - if (d_num_selected_out) HipcubDebug(g_allocator.DeviceFree(d_num_selected_out)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(h_in) + delete[] h_in; + if(h_reference) + delete[] h_reference; + if(d_in) + HIP_CHECK(g_allocator.DeviceFree(d_in)); + if(d_out) + HIP_CHECK(g_allocator.DeviceFree(d_out)); + if(d_num_selected_out) + HIP_CHECK(g_allocator.DeviceFree(d_num_selected_out)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); printf("\n\n"); diff --git a/examples/device/example_device_sort_find_non_trivial_runs.cpp b/examples/device/example_device_sort_find_non_trivial_runs.cpp index d0a1150d..afbdfed8 100644 --- a/examples/device/example_device_sort_find_non_trivial_runs.cpp +++ b/examples/device/example_device_sort_find_non_trivial_runs.cpp @@ -229,7 +229,7 @@ int main(int argc, char** argv) } // Initialize device - HipcubDebug(args.DeviceInit()); + HIP_CHECK(args.DeviceInit()); // Allocate host arrays (problem and reference solution) @@ -259,30 +259,51 @@ int main(int argc, char** argv) // Allocate and initialize device arrays for sorting DoubleBuffer d_keys; DoubleBuffer d_values; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items)); - - HipcubDebug(hipMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, hipMemcpyHostToDevice)); - HipcubDebug(hipMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, hipMemcpyHostToDevice)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items)); + HIP_CHECK( + g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items)); + + HIP_CHECK(hipMemcpy(d_keys.d_buffers[d_keys.selector], + h_keys, + sizeof(float) * num_items, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values.d_buffers[d_values.selector], + h_values, + sizeof(int) * num_items, + hipMemcpyHostToDevice)); // Start timer gpu_timer.Start(); // Allocate temporary storage for sorting size_t temp_storage_bytes = 0; - void *d_temp_storage = NULL; - HipcubDebug(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + void* d_temp_storage = NULL; + HIP_CHECK(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Do the sort - HipcubDebug(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); + HIP_CHECK(hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items)); // Free unused buffers and sorting temporary storage - if (d_keys.d_buffers[d_keys.selector ^ 1]) HipcubDebug(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1])); - if (d_values.d_buffers[d_values.selector ^ 1]) HipcubDebug(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1])); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(d_keys.d_buffers[d_keys.selector ^ 1]) + HIP_CHECK(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1])); + if(d_values.d_buffers[d_values.selector ^ 1]) + HIP_CHECK(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1])); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); // Start timer gpu_rle_timer.Start(); @@ -291,34 +312,33 @@ int main(int argc, char** argv) int *d_offests_out = NULL; int *d_lengths_out = NULL; int *d_num_runs = NULL; - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items)); - HipcubDebug(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1)); // Allocate temporary storage for isolating non-trivial runs d_temp_storage = NULL; - HipcubDebug(hipcub::DeviceRunLengthEncode::NonTrivialRuns( - d_temp_storage, - temp_storage_bytes, - d_keys.d_buffers[d_keys.selector], - d_offests_out, - d_lengths_out, - d_num_runs, - num_items)); - HipcubDebug(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, + temp_storage_bytes, + d_keys.d_buffers[d_keys.selector], + d_offests_out, + d_lengths_out, + d_num_runs, + num_items)); + HIP_CHECK(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Do the isolation - HipcubDebug(hipcub::DeviceRunLengthEncode::NonTrivialRuns( - d_temp_storage, - temp_storage_bytes, - d_keys.d_buffers[d_keys.selector], - d_offests_out, - d_lengths_out, - d_num_runs, - num_items)); + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, + temp_storage_bytes, + d_keys.d_buffers[d_keys.selector], + d_offests_out, + d_lengths_out, + d_num_runs, + num_items)); // Free keys buffer - if (d_keys.d_buffers[d_keys.selector]) HipcubDebug(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector])); + if(d_keys.d_buffers[d_keys.selector]) + HIP_CHECK(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector])); // // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys @@ -354,11 +374,16 @@ int main(int argc, char** argv) // GPU cleanup - if (d_values.d_buffers[d_values.selector]) HipcubDebug(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector])); - if (d_offests_out) HipcubDebug(g_allocator.DeviceFree(d_offests_out)); - if (d_lengths_out) HipcubDebug(g_allocator.DeviceFree(d_lengths_out)); - if (d_num_runs) HipcubDebug(g_allocator.DeviceFree(d_num_runs)); - if (d_temp_storage) HipcubDebug(g_allocator.DeviceFree(d_temp_storage)); + if(d_values.d_buffers[d_values.selector]) + HIP_CHECK(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector])); + if(d_offests_out) + HIP_CHECK(g_allocator.DeviceFree(d_offests_out)); + if(d_lengths_out) + HIP_CHECK(g_allocator.DeviceFree(d_lengths_out)); + if(d_num_runs) + HIP_CHECK(g_allocator.DeviceFree(d_num_runs)); + if(d_temp_storage) + HIP_CHECK(g_allocator.DeviceFree(d_temp_storage)); } // Host cleanup diff --git a/examples/example_utils.hpp b/examples/example_utils.hpp index ee59b875..7ad91634 100644 --- a/examples/example_utils.hpp +++ b/examples/example_utils.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,18 @@ #define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);} +#define HIP_CHECK(condition) \ + do \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } \ + while(0); + template T CoutCast(T val) { return val; } @@ -248,7 +260,7 @@ struct CommandLineArgs error = hipSetDevice(dev); if (error) break; - hipMemGetInfo(&device_free_physmem, &device_total_physmem); + HIP_CHECK(hipMemGetInfo(&device_free_physmem, &device_total_physmem)); // int ptx_version = 0; // error = hipcub::PtxVersion(ptx_version); @@ -418,7 +430,7 @@ int CompareDeviceResults( T *h_data = (T*) malloc(num_items * sizeof(T)); // Copy data back - hipMemcpy(h_data, d_data, sizeof(T) * num_items, hipMemcpyDeviceToHost); + HIP_CHECK(hipMemcpy(h_data, d_data, sizeof(T) * num_items, hipMemcpyDeviceToHost)); // Display data if (display_data) @@ -463,8 +475,8 @@ int CompareDeviceDeviceResults( T *h_data = (T*) malloc(num_items * sizeof(T)); // Copy data back - hipMemcpy(h_reference, d_reference, sizeof(T) * num_items, hipMemcpyDeviceToHost); - hipMemcpy(h_data, d_data, sizeof(T) * num_items, hipMemcpyDeviceToHost); + HIP_CHECK(hipMemcpy(h_reference, d_reference, sizeof(T) * num_items, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(h_data, d_data, sizeof(T) * num_items, hipMemcpyDeviceToHost)); // Display data if (display_data) { @@ -614,31 +626,31 @@ struct GpuTimer GpuTimer() { - hipEventCreate(&start); - hipEventCreate(&stop); + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); } ~GpuTimer() { - hipEventDestroy(start); - hipEventDestroy(stop); + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); } void Start() { - hipEventRecord(start, 0); + HIP_CHECK(hipEventRecord(start, 0)); } void Stop() { - hipEventRecord(stop, 0); + HIP_CHECK(hipEventRecord(stop, 0)); } float ElapsedMillis() { float elapsed; - hipEventSynchronize(stop); - hipEventElapsedTime(&elapsed, start, stop); + HIP_CHECK(hipEventSynchronize(stop)); + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); return elapsed; } }; diff --git a/hipcub/include/hipcub/backend/cub/device/device_select.hpp b/hipcub/include/hipcub/backend/cub/device/device_select.hpp index c3e82fa7..b0ba368d 100644 --- a/hipcub/include/hipcub/backend/cub/device/device_select.hpp +++ b/hipcub/include/hipcub/backend/cub/device/device_select.hpp @@ -53,7 +53,7 @@ class DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSelect::Flagged(d_temp_storage, @@ -77,7 +77,7 @@ class DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream, bool debug_synchronous) { @@ -99,7 +99,7 @@ class DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSelect::Flagged(d_temp_storage, @@ -119,7 +119,7 @@ class DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream, bool debug_synchronous) { @@ -143,7 +143,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream = 0) { @@ -167,7 +167,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream, bool debug_synchronous) @@ -189,7 +189,7 @@ class DeviceSelect size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream = 0) { @@ -208,7 +208,7 @@ class DeviceSelect size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream, bool debug_synchronous) @@ -223,6 +223,113 @@ class DeviceSelect stream); } + template + HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + FlagIterator d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream = 0) + { + return hipCUDAErrorTohipError(::cub::DeviceSelect::FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items, + select_op, + stream)); + } + + template + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + FlagIterator d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream, + bool debug_synchronous) + { + HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); + return FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items, + select_op, + stream); + } + + template + HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + IteratorT d_data, + FlagIterator d_flags, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream = 0) + { + return hipCUDAErrorTohipError(::cub::DeviceSelect::FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_data, + d_flags, + + d_num_selected_out, + num_items, + select_op, + stream)); + } + + template + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + IteratorT d_data, + FlagIterator d_flags, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream, + bool debug_synchronous) + { + HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); + return FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_data, + d_flags, + d_num_selected_out, + num_items, + select_op, + stream); + } + template HIPCUB_RUNTIME_FUNCTION static hipError_t Unique(void* d_temp_storage, @@ -230,7 +337,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream = 0) { return hipCUDAErrorTohipError(::cub::DeviceSelect::Unique(d_temp_storage, @@ -249,7 +356,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream, bool debug_synchronous) { diff --git a/hipcub/include/hipcub/backend/cub/iterator/tex_obj_input_iterator.hpp b/hipcub/include/hipcub/backend/cub/iterator/tex_obj_input_iterator.hpp new file mode 100644 index 00000000..4e144a62 --- /dev/null +++ b/hipcub/include/hipcub/backend/cub/iterator/tex_obj_input_iterator.hpp @@ -0,0 +1,57 @@ +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef HIPCUB_CUB_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_ +#define HIPCUB_CUB_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_ + +#include "../../../config.hpp" + +#include + +BEGIN_HIPCUB_NAMESPACE + +template +class TexObjInputIterator : public ::cub::TexObjInputIterator +{ +public: + template + inline hipError_t + BindTexture(Qualified* ptr, size_t bytes = size_t(-1), size_t texture_offset = 0) + { + return hipCUDAErrorTohipError( + ::cub::TexObjInputIterator::BindTexture(ptr, bytes, texture_offset)); + } + + inline hipError_t UnbindTexture() + { + return hipCUDAErrorTohipError(::cub::TexObjInputIterator::UnbindTexture()); + } + + HIPCUB_HOST_DEVICE inline TexObjInputIterator() : ::cub::TexObjInputIterator() {} + + HIPCUB_HOST_DEVICE inline TexObjInputIterator( + const ::cub::TexObjInputIterator other) + : ::cub::TexObjInputIterator(other) + {} +}; + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_CUB_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_ diff --git a/hipcub/include/hipcub/backend/cub/iterator/tex_ref_input_iterator.hpp b/hipcub/include/hipcub/backend/cub/iterator/tex_ref_input_iterator.hpp new file mode 100644 index 00000000..b1d53239 --- /dev/null +++ b/hipcub/include/hipcub/backend/cub/iterator/tex_ref_input_iterator.hpp @@ -0,0 +1,64 @@ +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef HIPCUB_CUB_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_ +#define HIPCUB_CUB_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_ + +#include "../../../config.hpp" + +#include + +BEGIN_HIPCUB_NAMESPACE + +template +class TexRefInputIterator : public ::cub::TexRefInputIterator +{ +public: + template + inline hipError_t + BindTexture(Qualified* ptr, size_t bytes = size_t(-1), size_t texture_offset = 0) + { + return hipCUDAErrorTohipError( + ::cub::TexRefInputIterator::BindTexture(ptr, + bytes, + texture_offset)); + } + + inline hipError_t UnbindTexture() + { + return hipCUDAErrorTohipError( + ::cub::TexRefInputIterator::UnbindTexture()); + } + + HIPCUB_HOST_DEVICE inline TexRefInputIterator() + : ::cub::TexRefInputIterator() + {} + + HIPCUB_HOST_DEVICE inline TexRefInputIterator( + const ::cub::TexRefInputIterator other) + : ::cub::TexRefInputIterator(other) + {} +}; + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_CUB_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_ diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp index 778ebd5b..91ef6ce4 100644 --- a/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp +++ b/hipcub/include/hipcub/backend/rocprim/device/device_histogram.hpp @@ -39,6 +39,68 @@ BEGIN_HIPCUB_NAMESPACE +namespace detail +{ +template + HIPCUB_HOST_DEVICE +HIPCUB_FORCEINLINE bool may_overflow(LevelT /* lower_level */, + LevelT /* upper_level */, + CommonT /* num_bins */, + ::std::false_type /* is_integral */) +{ + return false; +} + +// Returns true if the bin computation for a given combination of range (max_level - min_level) +// and number of bins may overflow. +template + HIPCUB_HOST_DEVICE +HIPCUB_FORCEINLINE bool may_overflow(LevelT lower_level, + LevelT upper_level, + CommonT num_bins, + ::std::true_type /* is_integral */) +{ + return static_cast(upper_level - lower_level) + > (::std::numeric_limits::max() / static_cast(num_bins)); +} + +template +struct int_arithmetic_t +{ + using type = ::std::conditional_t< + sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), + uint32_t, +#if HIPCUB_IS_INT128_ENABLED + ::std::conditional_t<(::std::is_same::value + || ::std::is_same::value), + CommonT, + uint64_t> +#else + uint64_t +#endif + >; +}; + +// If potential overflow is detected, returns hipErrorInvalidValue, otherwise hipSuccess. +template +HIPCUB_HOST_DEVICE +HIPCUB_FORCEINLINE hipError_t check_overflow(LevelT lower_level, LevelT upper_level, int num_levels) +{ + using sample_type = typename std::iterator_traits::value_type; + using common_type = typename std::common_type::type; + using int_arithmetic_t = typename int_arithmetic_t::type; + + if(may_overflow(lower_level, + upper_level, + static_cast(num_levels - 1), + ::std::is_integral{})) + { + return hipErrorInvalidValue; + } + return hipSuccess; +} +} // namespace detail + struct DeviceHistogram { template @@ -52,6 +114,11 @@ struct DeviceHistogram OffsetT num_samples, hipStream_t stream = 0) { + if(detail::check_overflow(lower_level, upper_level, num_levels) + != hipSuccess) + { + return hipErrorInvalidValue; + } return ::rocprim::histogram_even(d_temp_storage, temp_storage_bytes, d_samples, @@ -165,6 +232,13 @@ struct DeviceHistogram unsigned int levels[NUM_ACTIVE_CHANNELS]; for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++) { + if(detail::check_overflow(lower_level[channel], + upper_level[channel], + num_levels[channel]) + != hipSuccess) + { + return hipErrorInvalidValue; + } levels[channel] = num_levels[channel]; } return ::rocprim::multi_histogram_even( diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp index a333d9e5..e15a742b 100644 --- a/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp +++ b/hipcub/include/hipcub/backend/rocprim/device/device_select.hpp @@ -55,7 +55,7 @@ class DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream = 0) { return ::rocprim::select(d_temp_storage, @@ -80,7 +80,7 @@ class DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream, bool debug_synchronous) { @@ -102,7 +102,7 @@ class DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream = 0) { return Flagged(d_temp_storage, @@ -122,7 +122,7 @@ class DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream, bool debug_synchronous) { @@ -146,7 +146,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream = 0) { @@ -171,7 +171,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream, bool debug_synchronous) @@ -193,7 +193,7 @@ class DeviceSelect size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream = 0) { @@ -213,7 +213,7 @@ class DeviceSelect size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, SelectOp select_op, hipStream_t stream, bool debug_synchronous) @@ -228,6 +228,114 @@ class DeviceSelect stream); } + template + HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + FlagIterator d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream = 0) + { + return ::rocprim::select(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items, + select_op, + stream, + HIPCUB_DETAIL_DEBUG_SYNC_VALUE); + } + + template + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + FlagIterator d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream, + bool debug_synchronous) + { + HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); + return FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items, + select_op, + stream); + } + + template + HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + IteratorT d_data, + FlagIterator d_flags, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream = 0) + { + return FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_data, + d_flags, + d_data, + d_num_selected_out, + num_items, + select_op, + stream); + } + + template + HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION + static hipError_t FlaggedIf(void* d_temp_storage, + size_t& temp_storage_bytes, + IteratorT d_data, + FlagIterator d_flags, + NumSelectedIteratorT d_num_selected_out, + int64_t num_items, + SelectOp select_op, + hipStream_t stream, + bool debug_synchronous) + { + HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS(); + return FlaggedIf(d_temp_storage, + temp_storage_bytes, + d_data, + d_flags, + d_num_selected_out, + num_items, + select_op, + stream); + } + template HIPCUB_RUNTIME_FUNCTION static hipError_t Unique(void* d_temp_storage, @@ -235,7 +343,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream = 0) { return ::rocprim::unique(d_temp_storage, @@ -256,7 +364,7 @@ class DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + int64_t num_items, hipStream_t stream, bool debug_synchronous) { diff --git a/hipcub/include/hipcub/backend/rocprim/grid/grid_barrier.hpp b/hipcub/include/hipcub/backend/rocprim/grid/grid_barrier.hpp index 2f7d2062..4c04f8cf 100644 --- a/hipcub/include/hipcub/backend/rocprim/grid/grid_barrier.hpp +++ b/hipcub/include/hipcub/backend/rocprim/grid/grid_barrier.hpp @@ -165,7 +165,7 @@ class GridBarrierLifetime : public GridBarrier */ virtual ~GridBarrierLifetime() { - HostReset(); + (void)HostReset(); } diff --git a/hipcub/include/hipcub/backend/rocprim/grid/grid_queue.hpp b/hipcub/include/hipcub/backend/rocprim/grid/grid_queue.hpp index 59b19495..9538f738 100644 --- a/hipcub/include/hipcub/backend/rocprim/grid/grid_queue.hpp +++ b/hipcub/include/hipcub/backend/rocprim/grid/grid_queue.hpp @@ -116,63 +116,53 @@ class GridQueue OffsetT fill_size, hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; (void)stream; d_counters[FILL] = fill_size; d_counters[DRAIN] = 0; - result = hipSuccess; - return result; + return hipSuccess; } - HIPCUB_HOST hipError_t FillAndResetDrain( - OffsetT fill_size, - hipStream_t stream = 0) + HIPCUB_HOST + hipError_t FillAndResetDrain(OffsetT fill_size, hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; OffsetT counters[2]; counters[FILL] = fill_size; counters[DRAIN] = 0; - result = HipcubDebug(hipMemcpyAsync(d_counters, - counters, - sizeof(OffsetT) * 2, - hipMemcpyHostToDevice, - stream)); - return result; + return HipcubDebug(hipMemcpyAsync(d_counters, + counters, + sizeof(OffsetT) * 2, + hipMemcpyHostToDevice, + stream)); } /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. HIPCUB_DEVICE hipError_t ResetDrain(hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; (void)stream; d_counters[DRAIN] = 0; - result = hipSuccess; - return result; + + return hipSuccess; } - HIPCUB_HOST hipError_t ResetDrain(hipStream_t stream = 0) + HIPCUB_HOST + hipError_t ResetDrain(hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; - result = HipcubDebug(hipMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); - return result; + return HipcubDebug(hipMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); } /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. HIPCUB_DEVICE hipError_t ResetFill(hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; (void)stream; d_counters[FILL] = 0; - result = hipSuccess; - return result; + return hipSuccess; } - HIPCUB_HOST hipError_t ResetFill(hipStream_t stream = 0) + HIPCUB_HOST + hipError_t ResetFill(hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; - result = HipcubDebug(hipMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); - return result; + return HipcubDebug(hipMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); } @@ -181,24 +171,20 @@ class GridQueue OffsetT &fill_size, hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; (void)stream; fill_size = d_counters[FILL]; - result = hipSuccess; - return result; + + return hipSuccess; } - HIPCUB_HOST hipError_t FillSize( - OffsetT &fill_size, - hipStream_t stream = 0) + HIPCUB_HOST + hipError_t FillSize(OffsetT& fill_size, hipStream_t stream = 0) { - hipError_t result = hipErrorUnknown; - result = HipcubDebug(hipMemcpyAsync(&fill_size, - d_counters + FILL, - sizeof(OffsetT), - hipMemcpyDeviceToHost, - stream)); - return result; + return HipcubDebug(hipMemcpyAsync(&fill_size, + d_counters + FILL, + sizeof(OffsetT), + hipMemcpyDeviceToHost, + stream)); } diff --git a/hipcub/include/hipcub/config.hpp b/hipcub/include/hipcub/config.hpp index 75433460..77636b03 100644 --- a/hipcub/include/hipcub/config.hpp +++ b/hipcub/include/hipcub/config.hpp @@ -94,6 +94,7 @@ END_HIPCUB_NAMESPACE #define HIPCUB_HOST __host__ #define HIPCUB_DEVICE __device__ #define HIPCUB_HOST_DEVICE __host__ __device__ +#define HIPCUB_FORCEINLINE __forceinline__ #define HIPCUB_SHARED_MEMORY __shared__ // Helper macros to disable warnings in clang diff --git a/hipcub/include/hipcub/iterator/tex_obj_input_iterator.hpp b/hipcub/include/hipcub/iterator/tex_obj_input_iterator.hpp index f78f4932..08bdc1d0 100644 --- a/hipcub/include/hipcub/iterator/tex_obj_input_iterator.hpp +++ b/hipcub/include/hipcub/iterator/tex_obj_input_iterator.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,9 +33,8 @@ #ifdef __HIP_PLATFORM_AMD__ #include "../backend/rocprim/iterator/tex_obj_input_iterator.hpp" #elif defined(__HIP_PLATFORM_NVIDIA__) + #include "../backend/cub/iterator/tex_obj_input_iterator.hpp" #include "../config.hpp" - #include #endif #endif // HIPCUB_ITERATOR_DISCARD_OUTPUT__HPP_ - diff --git a/hipcub/include/hipcub/iterator/tex_ref_input_iterator.hpp b/hipcub/include/hipcub/iterator/tex_ref_input_iterator.hpp index c442c5b6..05e9dab0 100644 --- a/hipcub/include/hipcub/iterator/tex_ref_input_iterator.hpp +++ b/hipcub/include/hipcub/iterator/tex_ref_input_iterator.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,10 +33,8 @@ #ifdef __HIP_PLATFORM_AMD__ #include "../backend/rocprim/iterator/tex_ref_input_iterator.hpp" #elif defined(__HIP_PLATFORM_NVIDIA__) + #include "../backend/cub/iterator/tex_ref_input_iterator.hpp" #include "../config.hpp" - #include #endif #endif // HIPCUB_ITERATOR_DISCARD_OUTPUT__HPP_ - - diff --git a/test/extra/CMakeLists.txt b/test/extra/CMakeLists.txt index bbe36048..7b41f181 100644 --- a/test/extra/CMakeLists.txt +++ b/test/extra/CMakeLists.txt @@ -42,7 +42,7 @@ include(VerifyCompiler) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - set(CCCL_MINIMUM_VERSION 2.4.0) + set(CCCL_MINIMUM_VERSION 2.5.0) if(NOT DOWNLOAD_CUB) find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG) find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG) @@ -112,10 +112,18 @@ endif() find_package(hipcub CONFIG REQUIRED) # Build CXX flags -set(CMAKE_CXX_STANDARD 14) +if (NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +if (CMAKE_CXX_STANDARD EQUAL 14) + message(WARNING "C++14 will be deprecated in the next major release") +elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) + message(FATAL_ERROR "Only C++14 and C++17 are supported") +endif() + # Enable testing (ctest) enable_testing() diff --git a/test/hipcub/half.hpp b/test/hipcub/half.hpp index 2ceee491..954ba26a 100644 --- a/test/hipcub/half.hpp +++ b/test/hipcub/half.hpp @@ -235,20 +235,31 @@ struct half_t /// Multiply __host__ __device__ __forceinline__ - half_t operator*(const half_t &other) + half_t + operator*(const half_t& other) const { return half_t(float(*this) * float(other)); } /// Divide - __host__ __device__ __forceinline__ half_t operator/(const half_t& other) const + __host__ __device__ __forceinline__ + half_t& + operator/=(const half_t& other) + { + return *this = half_t(float(*this) / float(other)); + } + + friend __host__ __device__ __forceinline__ + half_t + operator/(half_t self, const half_t& other) { - return half_t(float(*this) / float(other)); + return self /= other; } /// Add __host__ __device__ __forceinline__ - half_t operator+(const half_t &other) + half_t + operator+(const half_t& other) const { return half_t(float(*this) + float(other)); } diff --git a/test/hipcub/test_hipcub_device_adjacent_difference.cpp b/test/hipcub/test_hipcub_device_adjacent_difference.cpp index 76e395a6..a9b73b70 100644 --- a/test/hipcub/test_hipcub_device_adjacent_difference.cpp +++ b/test/hipcub/test_hipcub_device_adjacent_difference.cpp @@ -207,9 +207,7 @@ TYPED_TEST(HipcubDeviceAdjacentDifference, SubtractLeftCopy) op, stream)); -#ifdef __HIP_PLATFORM_AMD__ ASSERT_GT(temporary_storage_bytes, 0U); -#endif void* d_temporary_storage; HIP_CHECK( @@ -457,8 +455,8 @@ TYPED_TEST(HipcubDeviceAdjacentDifferenceLargeTests, LargeIndicesAndOpOnce) ASSERT_EQ(flags[0], 1); ASSERT_EQ(flags[1], 1); - hipFree(d_temp_storage); - hipFree(d_flags); + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_flags)); } } } \ No newline at end of file diff --git a/test/hipcub/test_hipcub_device_for.cpp b/test/hipcub/test_hipcub_device_for.cpp index 0c1b3c9e..5cd00e81 100644 --- a/test/hipcub/test_hipcub_device_for.cpp +++ b/test/hipcub/test_hipcub_device_for.cpp @@ -148,7 +148,7 @@ TYPED_TEST(HipcubDeviceForTests, ForEach) test_utils::cleanupGraphHelper(graph, graph_instance); } - hipFree(d_input); + HIP_CHECK(hipFree(d_input)); } } @@ -238,7 +238,7 @@ TEST(HipcubDeviceForTestsTempStore, ForEachTempStore) // Check if have same number of odd numbers ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(h_count, expected)); - hipFree(d_input); + HIP_CHECK(hipFree(d_input)); } } } @@ -316,7 +316,7 @@ TYPED_TEST(HipcubDeviceForTests, ForEachN) test_utils::cleanupGraphHelper(graph, graph_instance); } - hipFree(d_input); + HIP_CHECK(hipFree(d_input)); } } diff --git a/test/hipcub/test_hipcub_device_histogram.cpp b/test/hipcub/test_hipcub_device_histogram.cpp index 15563c9b..3754214a 100644 --- a/test/hipcub/test_hipcub_device_histogram.cpp +++ b/test/hipcub/test_hipcub_device_histogram.cpp @@ -30,6 +30,7 @@ // hipcub API #include "hipcub/device/device_histogram.hpp" +#include "hipcub/iterator/counting_input_iterator.hpp" #include "hipcub/iterator/transform_input_iterator.hpp" // rows, columns, (row_stride - columns * Channels) @@ -139,7 +140,9 @@ typedef ::testing::Types, params1, params1, params1, - params1> + params1, + // Regression: sample_type = int and level_type = size_t + params1> Params1; TYPED_TEST_SUITE(HipcubDeviceHistogramEven, Params1); @@ -340,6 +343,105 @@ TYPED_TEST(HipcubDeviceHistogramEven, Even) } } +// Test HistogramEven overflow +template +class HipcubDeviceHistogramEvenOverflow : public ::testing::Test +{ +public: + using params = Params; +}; + +typedef ::testing::Types, + params1, + params1, + params1, + params1, + params1> + Params1Overflow; + +TYPED_TEST_SUITE(HipcubDeviceHistogramEvenOverflow, Params1Overflow); + +TYPED_TEST(HipcubDeviceHistogramEvenOverflow, EvenOverflow) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using sample_type = typename TestFixture::params::sample_type; + using counter_type = uint32_t; + using level_type = sample_type; + constexpr unsigned int bins = TestFixture::params::bins; + + // native host types + using native_level_type = test_utils::convert_to_fundamental_t; + + const native_level_type n_lower_level = 0; + const native_level_type n_upper_level = std::numeric_limits::max(); + + level_type lower_level = test_utils::convert_to_device(n_lower_level); + level_type upper_level = test_utils::convert_to_device(n_upper_level); + + hipStream_t stream = 0; // default + + const size_t size = 1000; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + // Generate data + auto d_input = hipcub::CountingInputIterator{0UL}; + counter_type* d_histogram; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_histogram, bins * sizeof(counter_type))); + + size_t temporary_storage_bytes = 0; + hipError_t error = hipcub::DeviceHistogram::HistogramEven(nullptr, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + lower_level, + upper_level, + int(size), + stream); + + // Allocate a some amount of temp storage bytes in case of an overflow of the bin + // computation. Note that the subsequent algorithm invocation will also fail. + if(error == hipErrorInvalidValue) + { + temporary_storage_bytes = 3; + } + + void* d_temporary_storage; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); + + error = hipcub::DeviceHistogram::HistogramEven(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_histogram, + bins + 1, + lower_level, + upper_level, + int(size), + stream); + + HIP_CHECK(hipFree(d_temporary_storage)); + HIP_CHECK(hipFree(d_histogram)); + + if(bins == 1 || sizeof(sample_type) <= 4UL) + { + ASSERT_EQ(error, hipSuccess); + } + else + { + ASSERT_EQ(error, hipErrorInvalidValue); + } + } +} + template, params2, params2, - params2> + params2, + // Regression: sample_type = int and level_type = size_t + params2> Params2; TYPED_TEST_SUITE(HipcubDeviceHistogramRange, Params2); @@ -629,7 +733,9 @@ typedef ::testing::Types, params3, params3, params3, - params3> + params3, + // Regression: sample_type = int and level_type = size_t + params3> Params3; TYPED_TEST_SUITE(HipcubDeviceHistogramMultiEven, Params3); @@ -951,7 +1057,9 @@ typedef ::testing::Types< params4, params4, params4, - params4> + params4, + // Regression: sample_type = int and level_type = size_t + params4> Params4; TYPED_TEST_SUITE(HipcubDeviceHistogramMultiRange, Params4); diff --git a/test/hipcub/test_hipcub_device_partition.cpp b/test/hipcub/test_hipcub_device_partition.cpp index a34682a6..53764a70 100644 --- a/test/hipcub/test_hipcub_device_partition.cpp +++ b/test/hipcub/test_hipcub_device_partition.cpp @@ -212,11 +212,11 @@ TYPED_TEST(HipcubDevicePartitionTests, Flagged) test_utils::cleanupGraphHelper(graph, graph_instance); } - hipFree(d_input); - hipFree(d_flags); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_flags)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); } } @@ -389,10 +389,10 @@ TYPED_TEST(HipcubDevicePartitionTests, If) test_utils::cleanupGraphHelper(graph, graph_instance); } - hipFree(d_input); - hipFree(d_output); - hipFree(d_selected_count_output); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); } } @@ -575,12 +575,12 @@ TYPED_TEST(HipcubDevicePartitionTests, IfThreeWay) test_utils::cleanupGraphHelper(graph, graph_instance); } - hipFree(d_input); - hipFree(d_first_output); - hipFree(d_second_output); - hipFree(d_unselected_output); - hipFree(d_selected_counts); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_first_output)); + HIP_CHECK(hipFree(d_second_output)); + HIP_CHECK(hipFree(d_unselected_output)); + HIP_CHECK(hipFree(d_selected_counts)); + HIP_CHECK(hipFree(d_temp_storage)); } } diff --git a/test/hipcub/test_hipcub_device_radix_sort.hpp b/test/hipcub/test_hipcub_device_radix_sort.hpp index f74b8666..8c4eaee3 100644 --- a/test/hipcub/test_hipcub_device_radix_sort.hpp +++ b/test/hipcub/test_hipcub_device_radix_sort.hpp @@ -43,6 +43,7 @@ if(error == hipErrorOutOfMemory) \ { \ std::cout << "Out of memory. Skipping size = " << size << std::endl; \ + (void)hipGetLastError(); /*reset error code to hipSuccess*/ \ break; \ } \ if(error != hipSuccess) \ diff --git a/test/hipcub/test_hipcub_device_run_length_encode.cpp b/test/hipcub/test_hipcub_device_run_length_encode.cpp index e29876a2..d25c5a1a 100644 --- a/test/hipcub/test_hipcub_device_run_length_encode.cpp +++ b/test/hipcub/test_hipcub_device_run_length_encode.cpp @@ -33,13 +33,18 @@ #include "test_utils_data_generation.hpp" -template +template struct params { using key_type = Key; using count_type = Count; static constexpr unsigned int min_segment_length = MinSegmentLength; static constexpr unsigned int max_segment_length = MaxSegmentLength; + static constexpr bool use_graphs = UseGraphs; }; template @@ -60,7 +65,10 @@ typedef ::testing::Types, params, params, params, - params> + params, + // Test graph capture + params, + params> Params; TYPED_TEST_SUITE(HipcubDeviceRunLengthEncode, Params); @@ -78,6 +86,13 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, Encode) std::uniform_real_distribution, std::uniform_int_distribution>::type; + hipStream_t stream = 0; // default + if(TestFixture::params::use_graphs) + { + // Default stream does not support hipGraph stream capture, so create one + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + } + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value @@ -87,7 +102,6 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, Encode) for(size_t size : test_utils::get_sizes(seed_value)) { SCOPED_TRACE(testing::Message() << "with size= " << size); - hipStream_t stream = 0; // default // Generate data and calculate expected results std::vector unique_expected; @@ -155,6 +169,12 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, Encode) HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); + hipGraph_t graph; + if(TestFixture::params::use_graphs) + { + graph = test_utils::createGraphHelper(stream); + } + HIP_CHECK(hipcub::DeviceRunLengthEncode::Encode(d_temporary_storage, temporary_storage_bytes, d_input, @@ -164,6 +184,12 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, Encode) size, stream)); + hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) + { + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + } + HIP_CHECK(hipFree(d_temporary_storage)); std::vector unique_output(runs_count_expected); @@ -196,8 +222,18 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, Encode) ASSERT_EQ(unique_output[i], unique_expected[i]); ASSERT_EQ(counts_output[i], counts_expected[i]); } + + if(TestFixture::params::use_graphs) + { + test_utils::cleanupGraphHelper(graph, graph_instance); + } } } + + if(TestFixture::params::use_graphs) + { + HIP_CHECK(hipStreamDestroy(stream)); + } } TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) @@ -214,6 +250,13 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) std::uniform_real_distribution, std::uniform_int_distribution>::type; + hipStream_t stream = 0; // default + if(TestFixture::params::use_graphs) + { + // Default stream does not support hipGraph stream capture, so create one + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + } + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value @@ -223,7 +266,6 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) for(size_t size : test_utils::get_sizes(seed_value)) { SCOPED_TRACE(testing::Message() << "with size= " << size); - hipStream_t stream = 0; // default // Generate data and calculate expected results std::vector offsets_expected; @@ -306,6 +348,12 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); + hipGraph_t graph; + if(TestFixture::params::use_graphs) + { + graph = test_utils::createGraphHelper(stream); + } + HIP_CHECK(hipcub::DeviceRunLengthEncode::NonTrivialRuns(d_temporary_storage, temporary_storage_bytes, d_input, @@ -315,6 +363,12 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) size, stream)); + hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) + { + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + } + HIP_CHECK(hipFree(d_temporary_storage)); std::vector offsets_output(runs_count_expected); @@ -350,6 +404,16 @@ TYPED_TEST(HipcubDeviceRunLengthEncode, NonTrivialRuns) ASSERT_EQ(offsets_output[i], offsets_expected[i]); ASSERT_EQ(counts_output[i], counts_expected[i]); } + + if(TestFixture::params::use_graphs) + { + test_utils::cleanupGraphHelper(graph, graph_instance); + } } } + + if(TestFixture::params::use_graphs) + { + HIP_CHECK(hipStreamDestroy(stream)); + } } diff --git a/test/hipcub/test_hipcub_device_select.cpp b/test/hipcub/test_hipcub_device_select.cpp index 2d361202..e4cac445 100644 --- a/test/hipcub/test_hipcub_device_select.cpp +++ b/test/hipcub/test_hipcub_device_select.cpp @@ -501,6 +501,181 @@ TYPED_TEST(HipcubDeviceSelectTests, SelectOp) } } +TYPED_TEST(HipcubDeviceSelectTests, FlaggedIf) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + using F = typename TestFixture::flag_type; + + constexpr bool inplace = std::is_same::value; + + hipStream_t stream = 0; // default + if(TestFixture::use_graphs) + { + // Default stream does not support hipGraph stream capture, so create one + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + } + + TestSelectOp select_flag_op; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + for(size_t size : test_utils::get_sizes(seed_value)) + { + SCOPED_TRACE(testing::Message() << "with size= " << size); + + // Generate data + std::vector input + = test_utils::get_random_data(size, + test_utils::convert_to_device(1), + test_utils::convert_to_device(100), + seed_value); + std::vector flags = test_utils::get_random_data(size, + static_cast(0), + static_cast(100), + seed_value + seed_value_addition); + + T* d_input; + F* d_flags; + U* d_output; + unsigned int* d_selected_count_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(T))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_flags, flags.size() * sizeof(F))); + if HIPCUB_IF_CONSTEXPR(!inplace) + { + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, input.size() * sizeof(U))); + } + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK( + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK( + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); + + // Calculate expected results on host + std::vector expected; + expected.reserve(input.size()); + for(size_t i = 0; i < input.size(); i++) + { + if(select_flag_op(flags[i])) + { + expected.push_back(input[i]); + } + } + + auto call = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) + { + if HIPCUB_IF_CONSTEXPR(inplace) + { + HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_selected_count_output, + input.size(), + select_flag_op, + stream)); + } + else + { + HIP_CHECK(hipcub::DeviceSelect::FlaggedIf(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + select_flag_op, + stream)); + } + }; + + // temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + call(d_temp_storage, temp_storage_size_bytes); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0U); + + // allocate temporary storage + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + hipGraph_t graph; + if(TestFixture::use_graphs) + { + graph = test_utils::createGraphHelper(stream); + } + + // Run + call(d_temp_storage, temp_storage_size_bytes); + + hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) + { + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + } + + HIP_CHECK(hipDeviceSynchronize()); + + // Check if number of selected value is as expected + unsigned int selected_count_output = 0; + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(selected_count_output), + hipMemcpyDeviceToHost)); + ASSERT_EQ(selected_count_output, expected.size()); + + // Check if output values are as expected + std::vector output(input.size()); + if HIPCUB_IF_CONSTEXPR(inplace) + { + HIP_CHECK(hipMemcpy(output.data(), + d_input, + output.size() * sizeof(U), + hipMemcpyDeviceToHost)); + } + else + { + HIP_CHECK(hipMemcpy(output.data(), + d_output, + output.size() * sizeof(U), + hipMemcpyDeviceToHost)); + } + + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected, expected.size())); + + if(TestFixture::use_graphs) + { + test_utils::cleanupGraphHelper(graph, graph_instance); + } + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_flags)); + if(!inplace) + { + HIP_CHECK(hipFree(d_output)); + } + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); + } + } + + if(TestFixture::use_graphs) + { + HIP_CHECK(hipStreamDestroy(stream)); + } +} + std::vector get_discontinuity_probabilities() { std::vector probabilities = {0.0001, 0.05, 0.25, 0.5, 0.75, 0.95}; @@ -717,6 +892,123 @@ TEST(HipcubDeviceSelectTests, UniqueDiscardOutputIterator) } } +template +struct TestLargeIndicesSelectOp +{ + T max_value; + __host__ __device__ + inline bool + operator()(const T& value) const + { + return test_utils::less()(value, T(max_value)); + } +}; + +class HipcubDeviceSelectLargeIndicesTests : public ::testing::TestWithParam +{ +public: + const bool debug_synchronous = false; +}; + +INSTANTIATE_TEST_SUITE_P(HipcubDeviceSelectLargeIndicesTest, + HipcubDeviceSelectLargeIndicesTests, + ::testing::Values(2048, 9643, 32768, 38713, 38713)); + +TEST_P(HipcubDeviceSelectLargeIndicesTests, LargeIndicesSelectOp) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using T = size_t; // input_type + using U = size_t; // output_type + using selected_count_type = size_t; + + hipStream_t stream = 0; // default stream + + const auto selected_size = GetParam(); + + for(size_t size : test_utils::get_large_sizes(0)) + { + SCOPED_TRACE(testing::Message() << "with size= " << size); + +// Support for large indices in DeviceSelect is not implemented in CUB yet. Disable test meanwhile. +#ifdef __HIP_PLATFORM_NVIDIA__ + std::cout << "Test disabled for large sizes until support is present in CUB" << std::endl; + GTEST_SKIP(); +#endif + + // Generate data + hipcub::CountingInputIterator d_input(0); + U* d_output; + selected_count_type* d_selected_count_output; + selected_count_type expected_output_size = selected_size; + TestLargeIndicesSelectOp select_op{expected_output_size}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, + sizeof(d_output[0]) * expected_output_size)); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_selected_count_output, + sizeof(d_selected_count_output[0]))); + + // Calculate expected results on host + std::vector expected_output(expected_output_size); + std::iota(expected_output.begin(), expected_output.end(), U(0)); + + // Temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + // Get the size of d_temp_storage + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + size, + select_op, + stream)); + + HIP_CHECK(hipDeviceSynchronize()); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // Allocate temporary storage + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run + HIP_CHECK(hipcub::DeviceSelect::If(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + size, + select_op, + stream)); + HIP_CHECK(hipDeviceSynchronize()); + + // Check if number of selected value is as expected + selected_count_type selected_count_output = 0; + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(*d_selected_count_output), + hipMemcpyDeviceToHost)); + ASSERT_EQ(selected_count_output, selected_size); + + // Check if outputs are as expected + std::vector output(expected_output_size); + HIP_CHECK(hipMemcpy(output.data(), + d_output, + sizeof(*d_output) * expected_output_size, + hipMemcpyDeviceToHost)); + + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_eq(output, expected_output, expected_output_size)); + + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_selected_count_output)); + HIP_CHECK(hipFree(d_temp_storage)); + } +} + template params; @@ -186,8 +186,8 @@ TYPED_TEST(HipcubDeviceSpmvTests, Spmv) params.num_rows = csr_matrix.num_rows; params.num_cols = csr_matrix.num_cols; params.num_nonzeros = csr_matrix.num_nonzeros; - params.alpha = alpha; - params.beta = beta; + params.alpha = alpha_const; + params.beta = beta_const; HIP_CHECK(hipMemcpy(params.d_values, csr_matrix.values, sizeof(T) * csr_matrix.num_nonzeros, hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(params.d_row_end_offsets, csr_matrix.row_offsets, sizeof(OffsetType) * (csr_matrix.num_rows + 1), hipMemcpyHostToDevice)); diff --git a/test/hipcub/test_hipcub_grid.cpp b/test/hipcub/test_hipcub_grid.cpp index 9f7f212a..ef64ba2d 100644 --- a/test/hipcub/test_hipcub_grid.cpp +++ b/test/hipcub/test_hipcub_grid.cpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -198,7 +198,7 @@ __global__ void KernelGridQueueInit(hipcub::GridQueue tile_queue) { if ((threadIdx.x == 0) && (blockIdx.x == 0)) { - tile_queue.ResetDrain(); + (void)tile_queue.ResetDrain(); } } diff --git a/test/hipcub/test_hipcub_iterators.cpp b/test/hipcub/test_hipcub_iterators.cpp index 73c51ae9..7b132858 100644 --- a/test/hipcub/test_hipcub_iterators.cpp +++ b/test/hipcub/test_hipcub_iterators.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -148,13 +148,13 @@ void iterator_test_function(IteratorType d_itr, std::vector &h_reference) IteratorType *h_itrs = (IteratorType*)malloc(sizeof(IteratorType) * 2); T* device_output; - g_allocator.DeviceAllocate((void**)&device_output, output.size() * sizeof(typename decltype(output)::value_type)); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&device_output, output.size() * sizeof(T))); // Run unguarded kernel Kernel<<<1, 1>>>(d_itr, device_output, d_itrs); - hipPeekAtLastError(); - hipDeviceSynchronize(); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK( hipMemcpy( @@ -174,14 +174,14 @@ void iterator_test_function(IteratorType d_itr, std::vector &h_reference) for(size_t i = 0; i < output.size(); i++) { - ASSERT_EQ(output[i], h_reference[i]); + ASSERT_EQ(output[i], h_reference[i]) << i; } IteratorType h_itr = d_itr + 21; ASSERT_TRUE(h_itr == h_itrs[0]); ASSERT_TRUE(d_itr == h_itrs[1]); - g_allocator.DeviceFree(device_output); + HIP_CHECK(g_allocator.DeviceFree(device_output)); } TYPED_TEST_SUITE(HipcubIteratorTests, HipcubIteratorTestsParams); @@ -215,14 +215,14 @@ TYPED_TEST(HipcubIteratorTests, TestCacheModifiedInput) h_reference[7] = h_data[0]; // Value at offset 0; T *d_data = NULL; - g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); HIP_CHECK(hipMemcpy(d_data, h_data.data(), TEST_VALUES * sizeof(T), hipMemcpyHostToDevice)); IteratorType d_itr((T*) d_data); iterator_test_function(d_itr, h_reference); - g_allocator.DeviceFree(d_data); + HIP_CHECK(g_allocator.DeviceFree(d_data)); } TYPED_TEST(HipcubIteratorTests, TestConstant) @@ -305,7 +305,7 @@ TYPED_TEST(HipcubIteratorTests, TestTransform) // Allocate device arrays T *d_data = NULL; - g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); HIP_CHECK( hipMemcpy( @@ -333,7 +333,7 @@ TYPED_TEST(HipcubIteratorTests, TestTransform) iterator_test_function(d_itr, h_reference); - g_allocator.DeviceFree(d_data); + HIP_CHECK(g_allocator.DeviceFree(d_data)); } TYPED_TEST(HipcubIteratorTests, TestTexObj) @@ -344,62 +344,70 @@ TYPED_TEST(HipcubIteratorTests, TestTexObj) hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, device_id)); std::string deviceName = std::string(props.gcnArchName); - if (deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) { + if(deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) + { // This is a gfx94x or gfx120x device, so skip this test GTEST_SKIP() << "Test not run on gfx94x or gfx120x as texture cache API is not supported"; } HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::input_type; - using CastT = typename TestFixture::input_type; + using T = typename TestFixture::input_type; + using CastT = typename TestFixture::input_type; using IteratorType = hipcub::TexObjInputIterator; // // Test iterator manipulation in kernel // - constexpr uint32_t TEST_VALUES = 11000; - constexpr uint32_t DUMMY_OFFSET = 500; - constexpr uint32_t DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; + constexpr uint32_t TEST_VALUES = 11000; + constexpr uint32_t DUMMY_OFFSET = 500; + constexpr uint32_t DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - //T *h_data = new T[TEST_VALUES]; - std::vector h_data(TEST_VALUES); - std::vector output = test_utils::get_random_data(TEST_VALUES, T(2), T(200), seed_value); + std::vector output + = test_utils::get_random_data(TEST_VALUES, T(2), T(200), seed_value); // Allocate device arrays - T *d_data = NULL; - T *d_dummy = NULL; - g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES); - hipMemcpy(d_data, h_data.data(), sizeof(T) * TEST_VALUES, hipMemcpyHostToDevice); + T* d_data = NULL; + T* d_dummy = NULL; + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); + HIP_CHECK(hipMemcpy(d_data, output.data(), sizeof(T) * TEST_VALUES, hipMemcpyHostToDevice)); - g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES); - hipMemcpy(d_dummy, h_data.data() + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, hipMemcpyHostToDevice); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); + HIP_CHECK(hipMemcpy(d_dummy, + output.data() + DUMMY_OFFSET, + sizeof(T) * DUMMY_TEST_VALUES, + hipMemcpyHostToDevice)); // Initialize reference data constexpr uint32_t array_size = 8; - std::vector h_reference(array_size); - h_reference[0] = h_data[0]; // Value at offset 0 - h_reference[1] = h_data[100]; // Value at offset 100 - h_reference[2] = h_data[1000]; // Value at offset 1000 - h_reference[3] = h_data[10000]; // Value at offset 10000 - h_reference[4] = h_data[1]; // Value at offset 1 - h_reference[5] = h_data[21]; // Value at offset 21 - h_reference[6] = h_data[11]; // Value at offset 11 - h_reference[7] = h_data[0]; // Value at offset 0; + std::vector h_reference(array_size); + h_reference[0] = output[0]; // Value at offset 0 + h_reference[1] = output[100]; // Value at offset 100 + h_reference[2] = output[1000]; // Value at offset 1000 + h_reference[3] = output[10000]; // Value at offset 10000 + h_reference[4] = output[1]; // Value at offset 1 + h_reference[5] = output[21]; // Value at offset 21 + h_reference[6] = output[11]; // Value at offset 11 + h_reference[7] = output[0]; // Value at offset 0; // Create and bind obj-based test iterator IteratorType d_obj_itr; - d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES); + HIP_CHECK(d_obj_itr.BindTexture((CastT*)d_data, sizeof(T) * TEST_VALUES)); + + // Create and bind dummy iterator of same type to check with interference + IteratorType d_obj_itr2; + HIP_CHECK(d_obj_itr2.BindTexture((CastT*)d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); iterator_test_function(d_obj_itr, h_reference); - g_allocator.DeviceFree(d_data); - g_allocator.DeviceFree(d_dummy); + HIP_CHECK(g_allocator.DeviceFree(d_data)); + HIP_CHECK(g_allocator.DeviceFree(d_dummy)); } } @@ -411,66 +419,70 @@ TYPED_TEST(HipcubIteratorTests, TestTexRef) hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, device_id)); std::string deviceName = std::string(props.gcnArchName); - if (deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) { + if(deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) + { // This is a gfx94x or gfx120x device, so skip this test GTEST_SKIP() << "Test not run on gfx94x or gfx120x as texture cache API is not supported"; } HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::input_type; - using CastT = typename TestFixture::input_type; + using T = typename TestFixture::input_type; + using CastT = typename TestFixture::input_type; using IteratorType = hipcub::TexRefInputIterator; // // Test iterator manipulation in kernel // - constexpr uint32_t TEST_VALUES = 11000; - constexpr uint32_t DUMMY_OFFSET = 500; - constexpr uint32_t DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; + constexpr uint32_t TEST_VALUES = 11000; + constexpr uint32_t DUMMY_OFFSET = 500; + constexpr uint32_t DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - //T *h_data = new T[TEST_VALUES]; - std::vector h_data(TEST_VALUES); - std::vector output = test_utils::get_random_data(TEST_VALUES, T(2), T(200), seed_value); + std::vector output + = test_utils::get_random_data(TEST_VALUES, T(2), T(200), seed_value); // Allocate device arrays - T *d_data = NULL; - T *d_dummy = NULL; - g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES); - hipMemcpy(d_data, h_data.data(), sizeof(T) * TEST_VALUES, hipMemcpyHostToDevice); + T* d_data = NULL; + T* d_dummy = NULL; + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); + HIP_CHECK(hipMemcpy(d_data, output.data(), sizeof(T) * TEST_VALUES, hipMemcpyHostToDevice)); - g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES); - hipMemcpy(d_dummy, h_data.data() + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, hipMemcpyHostToDevice); + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); + HIP_CHECK(hipMemcpy(d_dummy, + output.data() + DUMMY_OFFSET, + sizeof(T) * DUMMY_TEST_VALUES, + hipMemcpyHostToDevice)); // Initialize reference data constexpr uint32_t array_size = 8; - std::vector h_reference(array_size); - h_reference[0] = h_data[0]; // Value at offset 0 - h_reference[1] = h_data[100]; // Value at offset 100 - h_reference[2] = h_data[1000]; // Value at offset 1000 - h_reference[3] = h_data[10000]; // Value at offset 10000 - h_reference[4] = h_data[1]; // Value at offset 1 - h_reference[5] = h_data[21]; // Value at offset 21 - h_reference[6] = h_data[11]; // Value at offset 11 - h_reference[7] = h_data[0]; // Value at offset 0; + std::vector h_reference(array_size); + h_reference[0] = output[0]; // Value at offset 0 + h_reference[1] = output[100]; // Value at offset 100 + h_reference[2] = output[1000]; // Value at offset 1000 + h_reference[3] = output[10000]; // Value at offset 10000 + h_reference[4] = output[1]; // Value at offset 1 + h_reference[5] = output[21]; // Value at offset 21 + h_reference[6] = output[11]; // Value at offset 11 + h_reference[7] = output[0]; // Value at offset 0; // Create and bind ref-based test iterator IteratorType d_ref_itr; - d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES); + HIP_CHECK(d_ref_itr.BindTexture((CastT*)d_data, sizeof(T) * TEST_VALUES)); // Create and bind dummy iterator of same type to check with interference IteratorType d_ref_itr2; - d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES); + HIP_CHECK(d_ref_itr2.BindTexture((CastT*)d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); iterator_test_function(d_ref_itr, h_reference); - g_allocator.DeviceFree(d_data); - g_allocator.DeviceFree(d_dummy); + HIP_CHECK(g_allocator.DeviceFree(d_data)); + HIP_CHECK(g_allocator.DeviceFree(d_dummy)); } } @@ -482,58 +494,58 @@ TYPED_TEST(HipcubIteratorTests, TestTexTransform) hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, device_id)); std::string deviceName = std::string(props.gcnArchName); - if (deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) { + if(deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) + { // This is a gfx94x or gfx120x device, so skip this test GTEST_SKIP() << "Test not run on gfx94x or gfx120x as texture cache API is not supported"; } HIP_CHECK(hipSetDevice(device_id)); - using T = typename TestFixture::input_type; - using CastT = typename TestFixture::input_type; + using T = typename TestFixture::input_type; using TextureIteratorType = hipcub::TexRefInputIterator; - constexpr uint32_t TEST_VALUES = 11000; + constexpr uint32_t TEST_VALUES = 11000; - for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; - //T *h_data = new T[TEST_VALUES]; - std::vector h_data(TEST_VALUES); - std::vector output = test_utils::get_random_data(TEST_VALUES, T(2), T(200), seed_value); + std::vector output + = test_utils::get_random_data(TEST_VALUES, T(2), T(200), seed_value); // Allocate device arrays - T *d_data = NULL; - g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES); - hipMemcpy(d_data, h_data.data(), sizeof(T) * TEST_VALUES, hipMemcpyHostToDevice); + T* d_data = NULL; + HIP_CHECK(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); + HIP_CHECK(hipMemcpy(d_data, output.data(), sizeof(T) * TEST_VALUES, hipMemcpyHostToDevice)); TransformOp op; // Initialize reference data constexpr uint32_t array_size = 8; - std::vector h_reference(array_size); - h_reference[0] = op(h_data[0]); // Value at offset 0 - h_reference[1] = op(h_data[100]); // Value at offset 100 - h_reference[2] = op(h_data[1000]); // Value at offset 1000 - h_reference[3] = op(h_data[10000]); // Value at offset 10000 - h_reference[4] = op(h_data[1]); // Value at offset 1 - h_reference[5] = op(h_data[21]); // Value at offset 21 - h_reference[6] = op(h_data[11]); // Value at offset 11 - h_reference[7] = op(h_data[0]); // Value at offset 0; + std::vector h_reference(array_size); + h_reference[0] = op(output[0]); // Value at offset 0 + h_reference[1] = op(output[100]); // Value at offset 100 + h_reference[2] = op(output[1000]); // Value at offset 1000 + h_reference[3] = op(output[10000]); // Value at offset 10000 + h_reference[4] = op(output[1]); // Value at offset 1 + h_reference[5] = op(output[21]); // Value at offset 21 + h_reference[6] = op(output[11]); // Value at offset 11 + h_reference[7] = op(output[0]); // Value at offset 0; // Create and bind ref-based test iterator TextureIteratorType d_tex_itr; - d_tex_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES); + HIP_CHECK(d_tex_itr.BindTexture(d_data, sizeof(T) * TEST_VALUES)); // Create transform iterator - hipcub::TransformInputIterator, TextureIteratorType> xform_itr(d_tex_itr, op); + hipcub::TransformInputIterator, TextureIteratorType> xform_itr(d_tex_itr, + op); iterator_test_function< hipcub::TransformInputIterator, TextureIteratorType>, - T> - (xform_itr, h_reference); + T>(xform_itr, h_reference); - g_allocator.DeviceFree(d_data); + HIP_CHECK(g_allocator.DeviceFree(d_data)); } } diff --git a/test/hipcub/test_hipcub_util_ptx.cpp b/test/hipcub/test_hipcub_util_ptx.cpp index 509c2804..4373a70c 100644 --- a/test/hipcub/test_hipcub_util_ptx.cpp +++ b/test/hipcub/test_hipcub_util_ptx.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -223,7 +223,7 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleUp) << "where index = " << i; } } - hipFree(device_data); + HIP_CHECK(hipFree(device_data)); } } @@ -340,7 +340,7 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleDown) << "where index = " << i; } } - hipFree(device_data); + HIP_CHECK(hipFree(device_data)); } } @@ -466,8 +466,8 @@ TYPED_TEST(HipcubUtilPtxTests, ShuffleIndex) << "where index = " << i; } - hipFree(device_data); - hipFree(device_src_offsets); + HIP_CHECK(hipFree(device_data)); + HIP_CHECK(hipFree(device_src_offsets)); } } @@ -578,7 +578,7 @@ TEST(HipcubUtilPtxTests, ShuffleUpCustomStruct) << "where index = " << i; } } - hipFree(device_data); + HIP_CHECK(hipFree(device_data)); } } @@ -690,7 +690,7 @@ TEST(HipcubUtilPtxTests, ShuffleUpCustomAlignedStruct) << "where index = " << i; } } - hipFree(device_data); + HIP_CHECK(hipFree(device_data)); } } diff --git a/test/hipcub/test_hipcub_warp_reduce.cpp b/test/hipcub/test_hipcub_warp_reduce.cpp index 819cf1ed..d84a63c9 100644 --- a/test/hipcub/test_hipcub_warp_reduce.cpp +++ b/test/hipcub/test_hipcub_warp_reduce.cpp @@ -23,6 +23,7 @@ #include "common_test_header.hpp" #include "hipcub/warp/warp_reduce.hpp" +#include template< class T, @@ -107,14 +108,11 @@ typedef ::testing::Types< TYPED_TEST_SUITE(HipcubWarpReduceTests, HipcubWarpReduceTestParams); -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> +template __global__ __launch_bounds__(BlockSize) -void warp_reduce_kernel(T* device_input, T* device_output) +auto warp_reduce_kernel(T* device_input, T* device_output) -> + typename std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -134,6 +132,16 @@ void warp_reduce_kernel(T* device_input, T* device_output) } } +template +__global__ +__launch_bounds__(BlockSize) +auto warp_reduce_kernel(T* /*device_input*/, T* /*device_output*/) -> + typename std::enable_if_t> +{ + // This kernel should never be actually called; tests are filtered out at runtime + // if the device does not support the LogicalWarpSize +} + TYPED_TEST(HipcubWarpReduceTests, Reduce) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -257,14 +265,11 @@ TYPED_TEST(HipcubWarpReduceTests, Reduce) } } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> +template __global__ __launch_bounds__(BlockSize) -void warp_reduce_valid_kernel(T* device_input, T* device_output, const int valid) +auto warp_reduce_valid_kernel(T* device_input, T* device_output, const int valid) -> + typename std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -284,6 +289,16 @@ void warp_reduce_valid_kernel(T* device_input, T* device_output, const int valid } } +template +__global__ +__launch_bounds__(BlockSize) +auto warp_reduce_valid_kernel(T* /*device_input*/, T* /*device_output*/, const int /*valid*/) -> + typename std::enable_if_t> +{ + // This kernel should never be actually called; tests are filtered out at runtime + // if the device does not support the LogicalWarpSize +} + TYPED_TEST(HipcubWarpReduceTests, ReduceValid) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -407,15 +422,11 @@ TYPED_TEST(HipcubWarpReduceTests, ReduceValid) } } -template< - class T, - class Flag, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> +template __global__ __launch_bounds__(BlockSize) -void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) +auto head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) -> + typename std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -432,6 +443,16 @@ void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) output[index] = value; } +template +__global__ +__launch_bounds__(BlockSize) +auto head_segmented_warp_reduce_kernel(T* /*input*/, Flag* /*flags*/, T* /*output*/) -> + typename std::enable_if_t> +{ + // This kernel should never be actually called; tests are filtered out at runtime + // if the device does not support the LogicalWarpSize +} + TYPED_TEST(HipcubWarpReduceTests, HeadSegmentedReduceSum) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -610,15 +631,11 @@ TYPED_TEST(HipcubWarpReduceTests, HeadSegmentedReduceSum) } } -template< - class T, - class Flag, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> +template __global__ __launch_bounds__(BlockSize) -void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) +auto tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) -> + typename std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -636,6 +653,16 @@ void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) output[index] = value; } +template +__global__ +__launch_bounds__(BlockSize) +auto tail_segmented_warp_reduce_kernel(T* /*input*/, Flag* /*flags*/, T* /*output*/) -> + typename std::enable_if_t> +{ + // This kernel should never be actually called; tests are filtered out at runtime + // if the device does not support the LogicalWarpSize +} + TYPED_TEST(HipcubWarpReduceTests, TailSegmentedReduceSum) { int device_id = test_common_utils::obtain_device_from_ctest(); diff --git a/test/hipcub/test_hipcub_warp_scan.cpp b/test/hipcub/test_hipcub_warp_scan.cpp index ee5e41ee..2209d88f 100644 --- a/test/hipcub/test_hipcub_warp_scan.cpp +++ b/test/hipcub/test_hipcub_warp_scan.cpp @@ -23,6 +23,7 @@ #include "common_test_header.hpp" #include "hipcub/warp/warp_scan.hpp" +#include // Params for tests template< @@ -45,7 +46,6 @@ class HipcubWarpScanTests : public ::testing::Test { using type = typename Params::type; static constexpr unsigned int warp_size = Params::warp_size; }; - typedef ::testing::Types< // shuffle based scan @@ -108,14 +108,10 @@ typedef ::testing::Types< TYPED_TEST_SUITE(HipcubWarpScanTests, HipcubWarpScanTestParams); -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -__launch_bounds__(BlockSize) -void warp_inclusive_scan_kernel(T* device_input, T* device_output) +template +__global__ __launch_bounds__(BlockSize) +auto warp_inclusive_scan_kernel(T* device_input, T* device_output) + -> std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -132,6 +128,12 @@ void warp_inclusive_scan_kernel(T* device_input, T* device_output) device_output[index] = value; } +template +__global__ __launch_bounds__(BlockSize) +auto warp_inclusive_scan_kernel(T* /*device_input*/, T* /*device_output*/) + -> std::enable_if_t> +{} + TYPED_TEST(HipcubWarpScanTests, InclusiveScan) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -256,17 +258,12 @@ TYPED_TEST(HipcubWarpScanTests, InclusiveScan) } } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -__launch_bounds__(BlockSize) -void warp_inclusive_scan_reduce_kernel( - T* device_input, - T* device_output, - T* device_output_reductions) +template +__global__ __launch_bounds__(BlockSize) +auto warp_inclusive_scan_reduce_kernel(T* device_input, + T* device_output, + T* device_output_reductions) + -> std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -295,6 +292,14 @@ void warp_inclusive_scan_reduce_kernel( } } +template +__global__ __launch_bounds__(BlockSize) +auto warp_inclusive_scan_reduce_kernel(T* /*device_input*/, + T* /*device_output*/, + T* /*device_output_reductions*/) + -> std::enable_if_t> +{} + TYPED_TEST(HipcubWarpScanTests, InclusiveScanReduce) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -438,14 +443,10 @@ TYPED_TEST(HipcubWarpScanTests, InclusiveScanReduce) } } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -__launch_bounds__(BlockSize) -void warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) +template +__global__ __launch_bounds__(BlockSize) +auto warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) + -> std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -462,6 +463,12 @@ void warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) device_output[index] = value; } +template +__global__ __launch_bounds__(BlockSize) +auto warp_exclusive_scan_kernel(T* /*device_input*/, T* /*device_output*/, T /*init*/) + -> std::enable_if_t> +{} + TYPED_TEST(HipcubWarpScanTests, ExclusiveScan) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -589,18 +596,13 @@ TYPED_TEST(HipcubWarpScanTests, ExclusiveScan) } } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -__launch_bounds__(BlockSize) -void warp_exclusive_scan_reduce_kernel( - T* device_input, - T* device_output, - T* device_output_reductions, - T init) +template +__global__ __launch_bounds__(BlockSize) +auto warp_exclusive_scan_reduce_kernel(T* device_input, + T* device_output, + T* device_output_reductions, + T init) + -> std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -622,6 +624,15 @@ void warp_exclusive_scan_reduce_kernel( } } +template +__global__ __launch_bounds__(BlockSize) +auto warp_exclusive_scan_reduce_kernel(T* /*device_input*/, + T* /*device_output*/, + T* /*device_output_reductions*/, + T /*init*/) + -> std::enable_if_t> +{} + TYPED_TEST(HipcubWarpScanTests, ExclusiveReduceScan) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -778,18 +789,13 @@ TYPED_TEST(HipcubWarpScanTests, ExclusiveReduceScan) } } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -__launch_bounds__(BlockSize) -void warp_scan_kernel( - T* device_input, - T* device_inclusive_output, - T* device_exclusive_output, - T init) +template +__global__ __launch_bounds__(BlockSize) +auto warp_scan_kernel(T* device_input, + T* device_inclusive_output, + T* device_exclusive_output, + T init) + -> std::enable_if_t> { // Minimum size is 1 constexpr unsigned int warps_no = test_utils::max(BlockSize / LogicalWarpSize, 1u); @@ -808,6 +814,15 @@ void warp_scan_kernel( device_exclusive_output[index] = exclusive_output; } +template +__global__ __launch_bounds__(BlockSize) +auto warp_scan_kernel(T* /*device_input*/, + T* /*device_inclusive_output*/, + T* /*device_exclusive_output*/, + T /*init*/) + -> std::enable_if_t> +{} + TYPED_TEST(HipcubWarpScanTests, Scan) { int device_id = test_common_utils::obtain_device_from_ctest();