diff --git a/.gitignore b/.gitignore index fcbe0fa3a..17258e3de 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,6 @@ cagra_index ivf_flat_index ivf_pq_index +# cuvs_bench +datasets/ +/*.json \ No newline at end of file diff --git a/build.sh b/build.sh index a283bcd07..b463f0f0d 100755 --- a/build.sh +++ b/build.sh @@ -275,7 +275,7 @@ if hasArg tests || (( ${NUMARGS} == 0 )); then fi if hasArg bench-ann || (( ${NUMARGS} == 0 )); then - BUILD_ANN_BENCH=ON + BUILD_CUVS_BENCH=ON CMAKE_TARGET="${CMAKE_TARGET};${ANN_BENCH_TARGETS}" fi @@ -351,7 +351,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_C_TESTS=${BUILD_TESTS} \ - -DBUILD_ANN_BENCH=${BUILD_ANN_BENCH} \ + -DBUILD_CUVS_BENCH=${BUILD_CUVS_BENCH} \ -DBUILD_CPU_ONLY=${BUILD_CPU_ONLY} \ -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \ ${CACHE_ARGS} \ @@ -419,6 +419,11 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs fi +# Build and (optionally) install the cuvs_bench Python package +if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench +fi + # Build the cuvs Rust bindings if (( ${NUMARGS} == 0 )) || hasArg rust; then cd ${REPODIR}/rust diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 7e1014f25..73c42ca71 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -42,5 +42,8 @@ dependencies: - pandas - pylibraft==24.10.*,>=0.0.0a0 - pyyaml +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- setuptools - sysroot_linux-aarch64==2.17 +- wheel name: bench_ann_cuda-118_arch-aarch64 diff 
--git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 120b7afca..473e50bc6 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -42,5 +42,8 @@ dependencies: - pandas - pylibraft==24.10.*,>=0.0.0a0 - pyyaml +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- setuptools - sysroot_linux-64==2.17 +- wheel name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index ac0ea97e6..8a877c4c0 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -38,5 +38,8 @@ dependencies: - pandas - pylibraft==24.10.*,>=0.0.0a0 - pyyaml +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- setuptools - sysroot_linux-aarch64==2.17 +- wheel name: bench_ann_cuda-125_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index e593c240d..54859a77f 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -38,5 +38,8 @@ dependencies: - pandas - pylibraft==24.10.*,>=0.0.0a0 - pyyaml +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- setuptools - sysroot_linux-64==2.17 +- wheel name: bench_ann_cuda-125_arch-x86_64 diff --git a/conda/recipes/libcuvs/build_libcuvs_tests.sh b/conda/recipes/libcuvs/build_libcuvs_tests.sh index 5d77ae2d1..b077dbe60 100644 --- a/conda/recipes/libcuvs/build_libcuvs_tests.sh +++ b/conda/recipes/libcuvs/build_libcuvs_tests.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-./build.sh tests bench-ann --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats +./build.sh tests --allgpuarch --no-nvtx --build-metrics=tests --incl-cache-stats cmake --install cpp/build --component testing diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d8d554648..b72d7f165 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -55,7 +55,7 @@ option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON) option(BUILD_TESTS "Build cuvs unit-tests" ON) option(BUILD_C_LIBRARY "Build cuVS C API library" OFF) option(BUILD_C_TESTS "Build cuVS C API tests" OFF) -option(BUILD_ANN_BENCH "Build cuVS ann benchmarks" OFF) +option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF) option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO @@ -96,7 +96,7 @@ include(CMakeDependentOption) message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}") message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}") -message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_ANN_BENCH}") +message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}") message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS}) message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}") @@ -188,7 +188,7 @@ endif() include(cmake/thirdparty/get_cutlass.cmake) -if(BUILD_ANN_BENCH) +if(BUILD_CUVS_BENCH) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench(BUILD_STATIC) endif() @@ -651,6 +651,6 @@ endif() # ################################################################################################## # * build ann benchmark executable ----------------------------------------------- -if(BUILD_ANN_BENCH) +if(BUILD_CUVS_BENCH) add_subdirectory(bench/ann/) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt 
index 6fe23483e..3224587e4 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -199,30 +199,19 @@ if(NOT TARGET CUVS_ANN_BENCH_ALL) endif() if(CUVS_ANN_BENCH_USE_HNSWLIB) - ConfigureAnnBench( - NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib - ) + ConfigureAnnBench(NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib) endif() if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) ConfigureAnnBench( - NAME CUVS_IVF_PQ - PATH - src/cuvs/cuvs_benchmark.cu - src/cuvs/cuvs_ivf_pq.cu - LINKS cuvs + NAME CUVS_IVF_PQ PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_pq.cu LINKS cuvs ) endif() if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) ConfigureAnnBench( - NAME CUVS_IVF_FLAT - PATH - src/cuvs/cuvs_benchmark.cu - src/cuvs/cuvs_ivf_flat.cu - LINKS - cuvs + NAME CUVS_IVF_FLAT PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_flat.cu LINKS cuvs ) endif() @@ -232,12 +221,8 @@ endif() if(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE) ConfigureAnnBench( - NAME - CUVS_KNN_BRUTE_FORCE - PATH - $<$<BOOL:${CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE}>:src/cuvs/cuvs_brute_force_knn.cu> - LINKS - cuvs + NAME CUVS_KNN_BRUTE_FORCE PATH + $<$<BOOL:${CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE}>:src/cuvs/cuvs_brute_force_knn.cu> LINKS cuvs ) endif() @@ -258,8 +243,7 @@ endif() if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) ConfigureAnnBench( - NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs - hnswlib::hnswlib + NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib ) endif() @@ -267,36 +251,31 @@ message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}") message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}") if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT) ConfigureAnnBench( - NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS - ${CUVS_FAISS_TARGETS} + NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS} ) endif() if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT) ConfigureAnnBench( - NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS -
${CUVS_FAISS_TARGETS} + NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS} ) endif() if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ) ConfigureAnnBench( - NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS - ${CUVS_FAISS_TARGETS} + NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS} ) endif() if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND CUVS_FAISS_ENABLE_GPU) ConfigureAnnBench( - NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS - ${CUVS_FAISS_TARGETS} + NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS} ) endif() if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND CUVS_FAISS_ENABLE_GPU) ConfigureAnnBench( - NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS - ${CUVS_FAISS_TARGETS} + NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS} ) endif() @@ -322,13 +301,8 @@ if(CUVS_ANN_BENCH_SINGLE_EXE) target_link_libraries( ANN_BENCH - PRIVATE raft::raft - nlohmann_json::nlohmann_json - benchmark::benchmark - dl - fmt::fmt-header-only - spdlog::spdlog_header_only - $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3> + PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only + spdlog::spdlog_header_only $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3> ) set_target_properties( ANN_BENCH diff --git a/dependencies.yaml b/dependencies.yaml index c63cecbbe..c18f53305 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -38,6 +38,7 @@ files: - develop - bench - bench_python + - rapids_build_setuptools test_cpp: output: none includes: @@ -115,6 +116,13 @@ files: table: build-system includes: - rapids_build_setuptools + py_rapids_build_py_cuvs_bench: + output: pyproject + pyproject_dir: python/cuvs_bench + extras: + table: tool.rapids-build-backend + key: requires + includes: [] py_run_cuvs_bench: output: pyproject pyproject_dir: python/cuvs_bench @@ -187,7 +195,7 @@ dependencies: rapids_build_setuptools: common: - -
output_types: [requirements, pyproject] + - output_types: [conda, requirements, pyproject] packages: - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 - setuptools @@ -469,13 +477,12 @@ dependencies: - openblas bench_python: common: - - output_types: [conda] + - output_types: [conda, pyproject, requirements] packages: + - click - matplotlib - pandas - pyyaml - - pandas - - click depends_on_librmm: common: - output_types: conda diff --git a/python/cuvs/CMakeLists.txt b/python/cuvs/CMakeLists.txt index 7d2f8dcf9..feb3bd58c 100644 --- a/python/cuvs/CMakeLists.txt +++ b/python/cuvs/CMakeLists.txt @@ -83,14 +83,22 @@ if(NOT cuvs_FOUND) if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS) set(rpaths - "$ORIGIN/../nvidia/cublas/lib" - "$ORIGIN/../nvidia/curand/lib" - "$ORIGIN/../nvidia/cusolver/lib" - "$ORIGIN/../nvidia/cusparse/lib" - "$ORIGIN/../nvidia/nvjitlink/lib" + "$ORIGIN/../nvidia/cublas/lib" + "$ORIGIN/../nvidia/curand/lib" + "$ORIGIN/../nvidia/cusolver/lib" + "$ORIGIN/../nvidia/cusparse/lib" + "$ORIGIN/../nvidia/nvjitlink/lib" + ) + set_property( + TARGET cuvs + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) + set_property( + TARGET cuvs_c + PROPERTY INSTALL_RPATH ${rpaths} + APPEND ) - set_property(TARGET cuvs PROPERTY INSTALL_RPATH ${rpaths} APPEND) - set_property(TARGET cuvs_c PROPERTY INSTALL_RPATH ${rpaths} APPEND) endif() set(cython_lib_dir cuvs) diff --git a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt index 8f281d1c8..c90615feb 100644 --- a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt +++ b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt @@ -20,6 +20,5 @@ set(linked_libraries cuvs::cuvs cuvs::c_api) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX - neighbors_prefilter_ + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_prefilter_ ) diff 
--git a/python/cuvs_bench/cuvs_bench/config/__init__.py b/python/cuvs_bench/cuvs_bench/config/__init__.py new file mode 100644 index 000000000..7c04e3fd8 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .algos.constraints import * diff --git a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml new file mode 100644 index 000000000..dc1127fbc --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml @@ -0,0 +1,42 @@ +faiss_gpu_flat: + executable: FAISS_GPU_FLAT_ANN_BENCH + requires_gpu: true +faiss_gpu_ivf_flat: + executable: FAISS_GPU_IVF_FLAT_ANN_BENCH + requires_gpu: true +faiss_gpu_ivf_pq: + executable: FAISS_GPU_IVF_PQ_ANN_BENCH + requires_gpu: true +faiss_gpu_ivf_sq: + executable: FAISS_GPU_IVF_PQ_ANN_BENCH + requires_gpu: true +faiss_cpu_flat: + executable: FAISS_CPU_FLAT_ANN_BENCH + requires_gpu: false +faiss_cpu_ivf_flat: + executable: FAISS_CPU_IVF_FLAT_ANN_BENCH + requires_gpu: false +faiss_cpu_ivf_pq: + executable: FAISS_CPU_IVF_PQ_ANN_BENCH + requires_gpu: false +cuvs_ivf_flat: + executable: CUVS_IVF_FLAT_ANN_BENCH + requires_gpu: true +cuvs_ivf_pq: + executable: CUVS_IVF_PQ_ANN_BENCH + requires_gpu: true +cuvs_cagra: + executable: CUVS_CAGRA_ANN_BENCH + requires_gpu: true +cuvs_brute_force: + executable: CUVS_BRUTE_FORCE_ANN_BENCH + requires_gpu: true 
+ggnn: + executable: GGNN_ANN_BENCH + requires_gpu: true +hnswlib: + executable: HNSWLIB_ANN_BENCH + requires_gpu: false +cuvs_cagra_hnswlib: + executable: CUVS_CAGRA_HNSWLIB_ANN_BENCH + requires_gpu: true diff --git a/python/cuvs_bench/cuvs_bench/config/algos/__init__.py b/python/cuvs_bench/cuvs_bench/config/algos/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/cuvs_bench/cuvs_bench/config/constraints.py b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py similarity index 98% rename from python/cuvs_bench/cuvs_bench/config/constraints.py rename to python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py index ff451c056..de05bd752 100644 --- a/python/cuvs_bench/cuvs_bench/config/constraints.py +++ b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml index e7b049d0c..edacb25b5 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml @@ -1,11 +1,11 @@ name: cuvs_cagra constraints: - build: cuvs_bench.constraints.raft_cagra_build_constraints - search: cuvs_bench.constraints.raft_cagra_search_constraints + build: cuvs_bench.config.algos.constraints.cuvs_cagra_build + search: cuvs_bench.config.algos.constraints.cuvs_cagra_search groups: base: build: - graph_degree: [32, 64, 128, 256] + graph_degree: [32, 64, 96, 128] intermediate_graph_degree: [32, 64, 96, 128] graph_build_algo: ["NN_DESCENT"] search: diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml index 70e344dfd..f1a7f272c 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml @@ -1,6 +1,6 @@ name: cuvs_cagra_hnswlib constraints: - search: cuvs_bench.constraints.hnswlib_search + search: cuvs_bench.config.algos.constraints.hnswlib_search groups: base: build: diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml index aa95d6716..d68e7973a 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml @@ -1,7 +1,7 @@ name: cuvs_ivf_pq constraints: - build: cuvs_bench.constraints.cuvs_ivf_pq_build - search: cuvs_bench.constraints.cuvs_ivf_pq_search + build: cuvs_bench.config.algos.constraints.cuvs_ivf_pq_build + search: cuvs_bench.config.algos.constraints.cuvs_ivf_pq_search groups: base: build: diff --git a/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml 
b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml index 1bd78b736..782f3aed1 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml @@ -1,7 +1,7 @@ name: faiss_gpu_ivf_pq constraints: - build: cuvs_bench.constraints.faiss_gpu_ivf_pq_build - search: cuvs_bench.constraints.faiss_gpu_ivf_pq_search + build: cuvs_bench.config.algos.constraints.faiss_gpu_ivf_pq_build + search: cuvs_bench.config.algos.constraints.faiss_gpu_ivf_pq_search groups: base: build: diff --git a/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml index dbd73155d..93d8cff2d 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml @@ -1,6 +1,6 @@ name: hnswlib constraints: - search: cuvs_bench.constraints.hnswlib_search + search: cuvs_bench.config.algos.constraints.hnswlib_search groups: base: build: diff --git a/python/cuvs_bench/cuvs_bench/config/bigann-100M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/bigann-100M.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/bigann-100M.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/bigann-100M.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/datasets.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/datasets.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/datasets.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/datasets.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/deep-100M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/deep-100M.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/deep-100M.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/deep-100M.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/deep-1B.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/deep-1B.yaml similarity 
index 100% rename from python/cuvs_bench/cuvs_bench/config/deep-1B.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/deep-1B.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/deep-image-96-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/deep-image-96-inner.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/deep-image-96-inner.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/deep-image-96-inner.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/fashion-mnist-784-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/fashion-mnist-784-euclidean.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/fashion-mnist-784-euclidean.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/fashion-mnist-784-euclidean.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/gist-960-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/gist-960-euclidean.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/gist-960-euclidean.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/gist-960-euclidean.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/glove-100-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-100-angular.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/glove-100-angular.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-100-angular.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/glove-100-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-100-inner.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/glove-100-inner.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-100-inner.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/glove-50-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-50-angular.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/glove-50-angular.yaml rename to 
python/cuvs_bench/cuvs_bench/config/datasets/glove-50-angular.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/glove-50-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-50-inner.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/glove-50-inner.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-50-inner.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/lastfm-65-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/lastfm-65-angular.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/lastfm-65-angular.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/lastfm-65-angular.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/mnist-784-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/mnist-784-euclidean.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/mnist-784-euclidean.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/mnist-784-euclidean.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/nytimes-256-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-angular.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/nytimes-256-angular.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-angular.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/nytimes-256-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-inner.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/nytimes-256-inner.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-inner.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/sift-128-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/sift-128-euclidean.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/sift-128-euclidean.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/sift-128-euclidean.yaml diff --git 
a/python/cuvs_bench/cuvs_bench/config/wiki_all_10M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_10M.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/wiki_all_10M.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_10M.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/wiki_all_1M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_1M.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/wiki_all_1M.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_1M.yaml diff --git a/python/cuvs_bench/cuvs_bench/config/wiki_all_88M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_88M.yaml similarity index 100% rename from python/cuvs_bench/cuvs_bench/config/wiki_all_88M.yaml rename to python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_88M.yaml diff --git a/python/cuvs_bench/cuvs_bench/run/__init__.py b/python/cuvs_bench/cuvs_bench/run/__init__.py new file mode 100644 index 000000000..7cb04e6f8 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/run/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from .run import run_benchmark diff --git a/python/cuvs_bench/cuvs_bench/run/__main__.py b/python/cuvs_bench/cuvs_bench/run/__main__.py new file mode 100644 index 000000000..b5d99a4bf --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/run/__main__.py @@ -0,0 +1,216 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from pathlib import Path +from typing import Optional + +import click +from run import run_benchmark + + +@click.command() +@click.option( + "--subset-size", + type=click.IntRange(min=1), + help="The number of subset rows of the dataset to build the index", +) +@click.option( + "-k", + "--count", + default=10, + show_default=True, + type=click.IntRange(min=1), + prompt="Enter the number of neighbors to search for", + help="The number of nearest neighbors to search for", +) +@click.option( + "-bs", + "--batch-size", + default=10000, + show_default=True, + type=click.IntRange(min=1), + prompt="Enter the batch size", + help="Number of query vectors to use in each query trial", +) +@click.option( + "--dataset-configuration", + default=None, + show_default=True, + help="Path to YAML configuration file for datasets", +) +@click.option( + "--configuration", + help="Path to YAML configuration file or directory for algorithms.
" + "Any run groups found in the specified file/directory will " + "automatically override groups of the same name present in the " + "default configurations, including `base`.", +) +@click.option( + "--dataset", + default="glove-100-inner", + show_default=True, + prompt="Enter the name of dataset", + help="Name of dataset", +) +@click.option( + "--dataset-path", + default=lambda: os.environ.get( + "RAPIDS_DATASET_ROOT_DIR", + os.path.join(Path(__file__).parent, "datasets/"), + ), + show_default=True, + prompt="Enter the path to dataset folder", + help="Path to dataset folder, by default will look in " + "RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets " + "subdirectory from the calling directory.", +) +@click.option("--build", is_flag=True, help="Build the index") +@click.option("--search", is_flag=True, help="Perform the search") +@click.option( + "--algorithms", + default="cuvs_cagra", + show_default=True, + prompt="Enter the comma separated list of named algorithms to run", + help="Run only comma separated list of named algorithms. If parameters " + "`groups` and `algo-groups` are both undefined, then group `base` " + "is run by default.", +) +@click.option( + "--groups", + default="base", + show_default=True, + prompt="Enter the comma separated groups of parameters", + help="Run only comma separated groups of parameters", +) +@click.option( + "--algo-groups", + help="Add comma separated <algorithm>.<group> to run.
Example usage: " + ' "--algo-groups=cuvs_cagra.large,hnswlib.large".', +) +@click.option( + "-f", + "--force", + is_flag=True, + help="Re-run algorithms even if their results already exist", +) +@click.option( + "-m", + "--search-mode", + default="latency", + show_default=True, + prompt='Enter the search mode ("latency" or "throughput")', + help="Run search in 'latency' (measure individual batches) or " + "'throughput' (pipeline batches and measure end-to-end) mode.", +) +@click.option( + "-t", + "--search-threads", + default=None, + show_default=True, + help="Specify the number threads to use for throughput benchmark. " + "Single value or a pair of min and max separated by ':'. " + "Example: --search-threads=1:4. Power of 2 values between 'min' " + "and 'max' will be used. If only 'min' is specified, then a single " + "test is run with 'min' threads. By default min=1, " + "max=<num hyper threads>.", +) +@click.option( + "-r", + "--dry-run", + is_flag=True, + help="Dry-run mode will convert the yaml config for the specified " + "algorithms and datasets to the json format that’s consumed " + "by the lower-level c++ binaries and then print the command to " + "run execute the benchmarks but will not actually execute " + "the command.", +) +@click.option( + "--raft-log-level", + default="info", + show_default=True, + prompt="Enter the log level", + help="Log level, possible values are [off, error, warn, info, debug, " + "trace]. Default: 'info'.
Note that 'debug' or more detailed " + "logging level requires that the library is compiled with " + "-DRAFT_ACTIVE_LEVEL=<L> where <L> >= <requested log level>.", +) +def main( + subset_size: Optional[int], + count: int, + batch_size: int, + dataset_configuration: Optional[str], + configuration: Optional[str], + dataset: str, + dataset_path: str, + build: bool, + search: bool, + algorithms: Optional[str], + groups: str, + algo_groups: Optional[str], + force: bool, + search_mode: str, + search_threads: Optional[str], + dry_run: bool, + raft_log_level: str, +) -> None: + """ + Main function to run the benchmark with the provided options. + + Parameters + ---------- + subset_size : Optional[int] + The number of subset rows of the dataset to build the index. + count : int + The number of nearest neighbors to search for. + batch_size : int + Number of query vectors to use in each query trial. + dataset_configuration : Optional[str] + Path to YAML configuration file for datasets. + configuration : Optional[str] + Path to YAML configuration file or directory for algorithms. + dataset : str + Name of the dataset to use. + dataset_path : str + Path to the dataset folder. + build : bool + Whether to build the indices. + search : bool + Whether to perform the search. + algorithms : Optional[str] + Comma-separated list of algorithm names to use. + groups : str + Comma-separated list of groups to consider. + algo_groups : Optional[str] + Comma-separated list of algorithm groups to consider. + force : bool + Whether to force the execution regardless of warnings. + search_mode : str + The mode of search to perform ('latency' or 'throughput'). + search_threads : Optional[str] + The number of threads to use for throughput benchmark. + dry_run : bool + Whether to perform a dry run without actual execution. + raft_log_level : str + The logging level for the RAFT library.
+ + """ + + run_benchmark(**locals()) + + +if __name__ == "__main__": + main() diff --git a/python/cuvs_bench/cuvs_bench/run/run.py b/python/cuvs_bench/cuvs_bench/run/run.py new file mode 100644 index 000000000..dbedcc183 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/run/run.py @@ -0,0 +1,685 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import itertools +import os +import warnings +from importlib import import_module +from typing import Any, Dict, Optional, Tuple + +import yaml +from runners import cuvs_bench_cpp + + +def rmm_present() -> bool: + """ + Check if RMM (RAPIDS Memory Manager) is present. + + Returns + ------- + bool + True if RMM is present, False otherwise. + """ + try: + import rmm # noqa: F401 + + return True + except ImportError: + return False + + +def load_yaml_file(file_path: str) -> dict: + """ + Load a YAML file and return its contents as a dictionary. + + Parameters + ---------- + file_path : str + The path to the YAML file. + + Returns + ------- + dict + The contents of the YAML file. + """ + with open(file_path, "r") as f: + return yaml.safe_load(f) + + +def get_dataset_configuration(dataset: str, dataset_conf_all: list) -> dict: + """ + Retrieve the configuration for a specific dataset. + + Parameters + ---------- + dataset : str + The name of the dataset to retrieve the configuration for. + dataset_conf_all : list + A list of dataset configurations. 
+ + Returns + ------- + dict + The configuration for the specified dataset. + + Raises + ------ + ValueError + If the dataset configuration is not found. + """ + for dset in dataset_conf_all: + if dataset == dset["name"]: + return dset + raise ValueError("Could not find a dataset configuration") + + +def prepare_conf_file( + dataset_conf: dict, subset_size: Optional[int], count: int, batch_size: int +) -> dict: + """ + Prepare the main configuration file for the benchmark. + + Parameters + ---------- + dataset_conf : dict + The configuration for the dataset. + subset_size : Optional[int] + The subset size of the dataset. + count : int + The number of nearest neighbors to search for. + batch_size : int + The size of each batch for processing. + + Returns + ------- + dict + The prepared configuration file. + """ + conf_file = {"dataset": dataset_conf} + if subset_size: + conf_file["dataset"]["subset_size"] = subset_size + conf_file["search_basic_param"] = {"k": count, "batch_size": batch_size} + return conf_file + + +def gather_algorithm_configs( + scripts_path: str, configuration: Optional[str] +) -> list: + """ + Gather the list of algorithm configuration files. + + Parameters + ---------- + scripts_path : str + The path to the script directory. + configuration : Optional[str] + The path to the algorithm configuration directory or file. + + Returns + ------- + list + A list of paths to the algorithm configuration files. 
+ """ + algos_conf_fs = os.listdir( + os.path.join(scripts_path, "../config", "algos") + ) + algos_conf_fs = [ + os.path.join(scripts_path, "../config", "algos", f) + for f in algos_conf_fs + if ".json" not in f and "constraint" not in f and ".py" not in f + ] + + if configuration: + if os.path.isdir(configuration): + algos_conf_fs += [ + os.path.join(configuration, f) + for f in os.listdir(configuration) + if ".json" not in f + ] + elif os.path.isfile(configuration): + algos_conf_fs.append(configuration) + return algos_conf_fs + + +def load_algorithms_conf( + algos_conf_fs: list, + allowed_algos: Optional[list], + allowed_algo_groups: Optional[tuple], +) -> dict: + """ + Load and filter the algorithm configurations. + + Parameters + ---------- + algos_conf_fs : list + A list of paths to algorithm configuration files. + allowed_algos : Optional[list] + A list of allowed algorithm names to filter by. + allowed_algo_groups : Optional[tuple] + A tuple of allowed algorithm groups to filter by. + + Returns + ------- + dict + A dictionary containing the loaded and filtered algorithm + configurations. 
+ """ + algos_conf = {} + for algo_f in algos_conf_fs: + try: + algo = load_yaml_file(algo_f) + except Exception as e: + warnings.warn(f"Could not load YAML config {algo_f} due to {e}") + continue + if allowed_algos and algo["name"] not in allowed_algos: + continue + algos_conf[algo["name"]] = { + "groups": algo.get("groups", {}), + "constraints": algo.get("constraints", {}), + } + if allowed_algo_groups and algo["name"] in allowed_algo_groups[0]: + algos_conf[algo["name"]]["groups"].update( + { + group: algo["groups"][group] + for group in allowed_algo_groups[1] + if group in algo["groups"] + } + ) + return algos_conf + + +def prepare_executables( + algos_conf: dict, + algos_yaml: dict, + gpu_present: bool, + conf_file: dict, + dataset_path: str, + dataset: str, + count: int, + batch_size: int, +) -> dict: + """ + Prepare the list of executables to run based on the configurations. + + Parameters + ---------- + algos_conf : dict + The loaded algorithm configurations. + algos_yaml : dict + The global algorithms configuration. + gpu_present : bool + Whether a GPU is present. + conf_file : dict + The main configuration file. + dataset_path : str + The path to the dataset directory. + dataset : str + The name of the dataset. + count : int + The number of nearest neighbors to search for. + batch_size : int + The size of each batch for processing. + + Returns + ------- + dict + A dictionary of executables to run with their associated + configurations. 
+ """ + executables_to_run = {} + for algo, algo_conf in algos_conf.items(): + validate_algorithm(algos_yaml, algo, gpu_present) + for group, group_conf in algo_conf["groups"].items(): + executable = find_executable( + algos_yaml, algo, group, count, batch_size + ) + if executable not in executables_to_run: + executables_to_run[executable] = {"index": []} + indexes = prepare_indexes( + group_conf, + algo, + group, + conf_file, + algos_conf, + dataset_path, + dataset, + count, + batch_size, + ) + executables_to_run[executable]["index"].extend(indexes) + return executables_to_run + + +def validate_algorithm(algos_conf: dict, algo: str, gpu_present: bool) -> bool: + """ + Validate the algorithm based on the available hardware (GPU presence). + + Parameters + ---------- + algos_conf : dict + The configuration dictionary for the algorithms. + algo : str + The name of the algorithm. + gpu_present : bool + Whether a GPU is present. + + Returns + ------- + bool + True if the algorithm is valid for the current hardware + configuration, False otherwise. + """ + algos_conf_keys = set(algos_conf.keys()) + if gpu_present: + return algo in algos_conf_keys + return ( + algo in algos_conf_keys and algos_conf[algo]["requires_gpu"] is False + ) + + +def find_executable( + algos_conf: dict, algo: str, group: str, k: int, batch_size: int +) -> Tuple[str, str, Tuple[str, str]]: + """ + Find the executable for the given algorithm and group. + + Parameters + ---------- + algos_conf : dict + The configuration dictionary for the algorithms. + algo : str + The name of the algorithm. + group : str + The name of the group. + k : int + The number of nearest neighbors to search for. + batch_size : int + The size of each batch for processing. + + Returns + ------- + Tuple[str, str, Tuple[str, str]] + A tuple containing the executable name, the path to the executable, + and the file name. 
+ """ + executable = algos_conf[algo]["executable"] + file_name = (f"{algo},{group}", f"{algo},{group},k{k},bs{batch_size}") + build_path = get_build_path(executable) + if build_path: + return executable, build_path, file_name + raise FileNotFoundError(executable) + + +def get_build_path(executable: str) -> Optional[str]: + """ + Get the build path for the given executable. + + Parameters + ---------- + executable : str + The name of the executable. + + Returns + ------- + Optional[str] + The build path for the executable, if found. + """ + + devcontainer_path = "/home/coder/cuvs/cpp/build/latest/bench/ann" + if os.path.exists(devcontainer_path): + print(f"-- Detected devcontainer artifacts in {devcontainer_path}.") + return devcontainer_path + + build_path = os.getenv("CUVS_HOME") + if build_path: + build_path = os.path.join( + build_path, "cpp", "build", "release", executable + ) + if os.path.exists(build_path): + print(f"-- Using RAFT bench from repository in {build_path}.") + return build_path + + conda_path = os.getenv("CONDA_PREFIX") + if conda_path: + conda_executable = os.path.join(conda_path, "bin", "ann", executable) + if os.path.exists(conda_executable): + print("-- Using cuVS bench found in conda environment.") + return conda_executable + + return None + + +def prepare_indexes( + group_conf: dict, + algo: str, + group: str, + conf_file: dict, + algos_conf: dict, + dataset_path: str, + dataset: str, + count: int, + batch_size: int, +) -> list: + """ + Prepare the index configurations for the given algorithm and group. + + Parameters + ---------- + group_conf : dict + The configuration for the algorithm group. + algo : str + The name of the algorithm. + group : str + The name of the group. + conf_file : dict + The main configuration file. + dataset_path : str + The path to the dataset directory. + dataset : str + The name of the dataset. + count : int + The number of nearest neighbors to search for. 
+ batch_size : int + The size of each batch for processing. + + Returns + ------- + list + A list of index configurations. + """ + indexes = [] + build_params = group_conf.get("build", {}) + search_params = group_conf.get("search", {}) + all_build_params = itertools.product(*build_params.values()) + search_param_names, search_param_lists = ( + zip(*search_params.items()) if search_params else ([], []) + ) + param_names = list(build_params.keys()) + for params in all_build_params: + index = { + "algo": algo, + "build_param": dict(zip(build_params.keys(), params)), + } + index_name = f"{algo}_{group}" if group != "base" else f"{algo}" + for i in range(len(params)): + index["build_param"][param_names[i]] = params[i] + index_name += "." + f"{param_names[i]}{params[i]}" + + if not validate_constraints( + algos_conf, + algo, + "build", + index["build_param"], + None, + conf_file["dataset"].get("dims"), + count, + batch_size, + ): + continue + + index_filename = ( + index_name if len(index_name) < 128 else str(hash(index_name)) + ) + index["name"] = index_name + index["file"] = os.path.join( + dataset_path, dataset, "index", index_filename + ) + index["search_params"] = validate_search_params( + itertools.product(*search_param_lists), + search_param_names, + index["build_param"], + algo, + group_conf, + algos_conf, + conf_file, + count, + batch_size, + ) + if index["search_params"]: + indexes.append(index) + return indexes + + +def validate_search_params( + all_search_params, + search_param_names, + build_params, + algo, + group_conf, + algos_conf, + conf_file, + count, + batch_size, +) -> list: + """ + Validate and prepare the search parameters for the given algorithm + and group. + + Parameters + ---------- + all_search_params : itertools.product + The Cartesian product of search parameter values. + search_param_names : list + The names of the search parameters. + algo : str + The name of the algorithm. + group_conf : dict + The configuration for the algorithm group. 
+ conf_file : dict + The main configuration file. + count : int + The number of nearest neighbors to search for. + batch_size : int + The size of each batch for processing. + + Returns + ------- + list + A list of validated search parameters. + """ + search_params_list = [] + for search_params in all_search_params: + search_dict = dict(zip(search_param_names, search_params)) + if validate_constraints( + algos_conf, + algo, + "search", + search_dict, + build_params, + conf_file["dataset"].get("dims"), + count, + batch_size, + ): + search_params_list.append(search_dict) + return search_params_list + + +def validate_constraints( + algos_conf: Dict[str, Any], + algo: str, + constraint_type: str, + param: Dict[str, Any], + build_param: dict, + dims: Any, + k: Optional[int], + batch_size: Optional[int], +) -> bool: + """ + Validate the constraints for the given algorithm and constraint type. + + Parameters + ---------- + algos_conf : Dict[str, Any] + The configuration dictionary for the algorithms. + algo : str + The name of the algorithm. + constraint_type : str + The type of constraint to validate ('build' or 'search'). + param : Dict[str, Any] + The parameters to validate against the constraints. + dims : Any + The dimensions required for the constraints. + k : Optional[int] + The number of nearest neighbors to search for. + batch_size : Optional[int] + The size of each batch for processing. + + Returns + ------- + bool + True if the constraints are valid, False otherwise. + + Raises + ------ + ValueError + If `dims` are needed for build constraints but not specified in the + dataset configuration. 
+ """ + if constraint_type in algos_conf[algo]["constraints"]: + importable = algos_conf[algo]["constraints"][constraint_type] + module, func = ( + ".".join(importable.split(".")[:-1]), + importable.split(".")[-1], + ) + validator = import_module(module) + constraints_func = getattr(validator, func) + if constraint_type == "build": + return constraints_func(param, dims) + else: + return constraints_func(param, build_param, k, batch_size) + return True + + +def run_benchmark( + subset_size: int, + count: int, + batch_size: int, + dataset_configuration: Optional[str], + configuration: Optional[str], + dataset: str, + dataset_path: str, + build: Optional[bool], + search: Optional[bool], + algorithms: Optional[str], + groups: str, + algo_groups: Optional[str], + force: bool, + search_mode: str, + search_threads: int, + dry_run: bool, + raft_log_level: int, +) -> None: + """ + Runs a benchmarking process based on the provided configurations. + + Parameters + ---------- + subset_size : int + The subset size of the dataset. + count : int + The number of nearest neighbors to search for. + batch_size : int + The size of each batch for processing. + dataset_configuration : Optional[str] + Path to the dataset configuration file. + configuration : Optional[str] + Path to the algorithm configuration directory or file. + dataset : str + The name of the dataset to use. + dataset_path : str + The path to the dataset directory. + build : Optional[bool] + Whether to build the indices. + search : Optional[bool] + Whether to perform the search. + algorithms : Optional[str] + Comma-separated list of algorithm names to use. + groups : str + Comma-separated list of groups to consider. + algo_groups : Optional[str] + Comma-separated list of algorithm groups to consider. + force : bool + Whether to force the execution regardless of warnings. + search_mode : str + The mode of search to perform. + search_threads : int + The number of threads to use for searching. 
+ dry_run : bool + Whether to perform a dry run without actual execution. + raft_log_level : int + The logging level for the RAFT library. + + Returns + ------- + None + """ + scripts_path = os.path.dirname(os.path.realpath(__file__)) + gpu_present = rmm_present() + + if not build and not search: + build, search = True, True + + dataset_conf_all = load_yaml_file( + dataset_configuration + or os.path.join(scripts_path, "../config/datasets", "datasets.yaml") + ) + dataset_conf = get_dataset_configuration(dataset, dataset_conf_all) + conf_file = prepare_conf_file(dataset_conf, subset_size, count, batch_size) + algos_conf_fs = gather_algorithm_configs(scripts_path, configuration) + + allowed_algos = algorithms.split(",") if algorithms else None + allowed_algo_groups = ( + [algo_group.split(".") for algo_group in algo_groups.split(",")] + if algo_groups + else None + ) + algos_conf = load_algorithms_conf( + algos_conf_fs, + allowed_algos, + list(zip(*allowed_algo_groups)) if allowed_algo_groups else None, + ) + + executables_to_run = prepare_executables( + algos_conf, + load_yaml_file( + os.path.join(scripts_path, "../config", "algorithms.yaml") + ), + gpu_present, + conf_file, + dataset_path, + dataset, + count, + batch_size, + ) + + cuvs_bench_cpp( + conf_file, + dataset, + os.path.dirname(configuration) + if configuration and os.path.isfile(configuration) + else os.path.join(scripts_path, "conf", "algos"), + executables_to_run, + dataset_path, + force, + build, + search, + dry_run, + count, + batch_size, + search_threads, + search_mode, + raft_log_level, + ) diff --git a/python/cuvs_bench/cuvs_bench/run/runners.py b/python/cuvs_bench/cuvs_bench/run/runners.py new file mode 100644 index 000000000..5a540d2e5 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/run/runners.py @@ -0,0 +1,273 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import os +import subprocess +import uuid +from typing import Dict, List, Optional, Tuple + + +def cuvs_bench_cpp( + conf_file: Dict, + conf_filename: str, + conf_filedir: str, + executables_to_run: Dict[ + Tuple[str, str, Tuple[str, str]], Dict[str, List[Dict]] + ], + dataset_path: str, + force: bool, + build: bool, + search: bool, + dry_run: bool, + k: int, + batch_size: int, + search_threads: Optional[int], + mode: str = "throughput", + raft_log_level: str = "info", +) -> None: + """ + Run the CUVS benchmarking tool with the provided configuration. + + Parameters + ---------- + conf_file : Dict + The configuration file content. + conf_filename : str + The name of the configuration file. + conf_filedir : str + The directory of the configuration file. + executables_to_run : Dict[Tuple[str, str, Tuple[str, str]], + Dict[str, List[Dict]]] + Dictionary of executables to run and their configurations. + dataset_path : str + The path to the dataset. + force : bool + Whether to force the execution regardless of existing results. + build : bool + Whether to build the indices. + search : bool + Whether to perform the search. + dry_run : bool + Whether to perform a dry run without actual execution. + k : int + The number of nearest neighbors to search for. + batch_size : int + The size of each batch for processing. + search_threads : Optional[int] + The number of threads to use for searching. + mode : str, optional + The mode of search to perform ('latency' or 'throughput'), + by default 'throughput'. 
+ raft_log_level : str, optional + The logging level for the RAFT library, by default 'info'. + + Returns + ------- + None + """ + for ( + executable, + ann_executable_path, + output_filename, + ) in executables_to_run.keys(): + # Need to write temporary configuration + temp_conf_filename = ( + f"{conf_filename}_{output_filename[1]}_{uuid.uuid1()}.json" + ) + with open(temp_conf_filename, "w") as f: + temp_conf = { + "dataset": conf_file["dataset"], + "search_basic_param": conf_file["search_basic_param"], + "index": executables_to_run[ + (executable, ann_executable_path, output_filename) + ]["index"], + } + json_str = json.dumps(temp_conf, indent=2) + f.write(json_str) + + legacy_result_folder = os.path.join( + dataset_path, conf_file["dataset"]["name"], "result" + ) + os.makedirs(legacy_result_folder, exist_ok=True) + + if build: + build_folder = os.path.join(legacy_result_folder, "build") + os.makedirs(build_folder, exist_ok=True) + build_file = f"{output_filename[0]}.json" + temp_build_file = f"{build_file}.lock" + benchmark_out = os.path.join(build_folder, temp_build_file) + cmd = [ + ann_executable_path, + "--build", + f"--data_prefix={dataset_path}", + "--benchmark_out_format=json", + "--benchmark_counters_tabular=true", + f"--benchmark_out={os.path.join(benchmark_out)}", + f"--raft_log_level={parse_log_level(raft_log_level)}", + ] + if force: + cmd.append("--force") + cmd.append(temp_conf_filename) + + if dry_run: + print( + f"Benchmark command for {output_filename[0]}:\n" + f"{' '.join(cmd)}\n" + ) + else: + try: + subprocess.run(cmd, check=True) + merge_build_files( + build_folder, build_file, temp_build_file + ) + except Exception as e: + print(f"Error occurred running benchmark: {e}") + finally: + os.remove(os.path.join(build_folder, temp_build_file)) + if not search: + os.remove(temp_conf_filename) + + if search: + search_folder = os.path.join(legacy_result_folder, "search") + os.makedirs(search_folder, exist_ok=True) + search_file = 
f"{output_filename[1]}.json" + cmd = [ + ann_executable_path, + "--search", + f"--data_prefix={dataset_path}", + "--benchmark_counters_tabular=true", + f"--override_kv=k:{k}", + f"--override_kv=n_queries:{batch_size}", + "--benchmark_min_warmup_time=1", + "--benchmark_out_format=json", + f"--mode={mode}", + f"--benchmark_out={os.path.join(search_folder, search_file)}", + f"--raft_log_level={parse_log_level(raft_log_level)}", + ] + if force: + cmd.append("--force") + if search_threads: + cmd.append(f"--threads={search_threads}") + cmd.append(temp_conf_filename) + + if dry_run: + print( + f"Benchmark command for {output_filename[1]}:\n" + f"{' '.join(cmd)}\n" + ) + else: + try: + subprocess.run(cmd, check=True) + except Exception as e: + print(f"Error occurred running benchmark: {e}") + finally: + os.remove(temp_conf_filename) + + +log_levels = { + "off": 0, + "error": 1, + "warn": 2, + "info": 3, + "debug": 4, + "trace": 5, +} + + +def parse_log_level(level_str: str) -> int: + """ + Parse the log level from string to integer. + + Parameters + ---------- + level_str : str + The log level as a string. + + Returns + ------- + int + The corresponding integer value of the log level. + + Raises + ------ + ValueError + If the log level string is invalid. + """ + if level_str not in log_levels: + raise ValueError(f"Invalid log level: {level_str}") + return log_levels[level_str.lower()] + + +def merge_build_files( + build_dir: str, build_file: str, temp_build_file: str +) -> None: + """ + Merge temporary build files into the main build file. + + Parameters + ---------- + build_dir : str + The directory of the build files. + build_file : str + The main build file. + temp_build_file : str + The temporary build file to merge. + + Returns + ------- + None + + Raises + ------ + ValueError + If the temporary build file is not found. 
+ """ + build_dict = {} + + # If build file exists, read it + build_json_path = os.path.join(build_dir, build_file) + tmp_build_json_path = os.path.join(build_dir, temp_build_file) + if os.path.isfile(build_json_path): + try: + with open(build_json_path, "r") as f: + build_dict = json.load(f) + except Exception as e: + print( + f"Error loading existing build file: {build_json_path} ({e})" + ) + + temp_build_dict = {} + if os.path.isfile(tmp_build_json_path): + with open(tmp_build_json_path, "r") as f: + temp_build_dict = json.load(f) + else: + raise ValueError(f"Temp build file not found: {tmp_build_json_path}") + + tmp_benchmarks = temp_build_dict.get("benchmarks", {}) + benchmarks = build_dict.get("benchmarks", {}) + + # If the build time is absolute 0 then an error occurred + final_bench_dict = {b["name"]: b for b in benchmarks if b["real_time"] > 0} + + for tmp_bench in tmp_benchmarks: + if tmp_bench["real_time"] > 0: + final_bench_dict[tmp_bench["name"]] = tmp_bench + + temp_build_dict["benchmarks"] = list(final_bench_dict.values()) + with open(build_json_path, "w") as f: + json_str = json.dumps(temp_build_dict, indent=2) + f.write(json_str) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_run.py b/python/cuvs_bench/cuvs_bench/tests/test_run.py new file mode 100644 index 000000000..7b7a481a0 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/tests/test_run.py @@ -0,0 +1,227 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import itertools +from unittest.mock import MagicMock, mock_open, patch + +import pytest +from benchmark import ( + find_executable, + gather_algorithm_configs, + get_dataset_configuration, + load_algorithms_conf, + load_yaml_file, + prepare_conf_file, + prepare_executables, + prepare_indexes, + rmm_present, + validate_algorithm, + validate_constraints, + validate_search_params, +) + + +def test_load_yaml_file(): + yaml_content = """ + key: value + """ + with patch("builtins.open", mock_open(read_data=yaml_content)): + result = load_yaml_file("dummy_path.yaml") + assert result == {"key": "value"} + + +def test_get_dataset_configuration(): + dataset_conf_all = [{"name": "dataset1"}, {"name": "dataset2"}] + result = get_dataset_configuration("dataset1", dataset_conf_all) + assert result == {"name": "dataset1"} + with pytest.raises(ValueError): + get_dataset_configuration("non_existent_dataset", dataset_conf_all) + + +def test_prepare_conf_file(): + dataset_conf = {"name": "dataset1"} + result = prepare_conf_file(dataset_conf, 1000, 10, 128) + expected_result = { + "dataset": {"name": "dataset1", "subset_size": 1000}, + "search_basic_param": {"k": 10, "batch_size": 128}, + } + assert result == expected_result + result_no_subset = prepare_conf_file(dataset_conf, None, 10, 128) + assert result_no_subset["dataset"].get("subset_size") is None + + +def test_gather_algorithm_configs(tmpdir): + scripts_path = tmpdir.mkdir("scripts") + algos_path = scripts_path.mkdir("algos") + algos_path.join("algo1.yaml").write("key: value") + algos_path.join("algo2.yaml").write("key: value") + result = gather_algorithm_configs(str(scripts_path), None) + assert len(result) == 2 + + custom_conf_dir = tmpdir.mkdir("custom_conf") + custom_conf_dir.join("custom_algo.yaml").write("key: value") + result = gather_algorithm_configs(str(scripts_path), str(custom_conf_dir)) + assert 
len(result) == 3 + + custom_conf_file = custom_conf_dir.join("custom_algo_file.yaml") + custom_conf_file.write("key: value") + result = gather_algorithm_configs(str(scripts_path), str(custom_conf_file)) + assert len(result) == 4 + + +def test_load_algorithms_conf(): + algos_conf_fs = ["path/to/algo1.yaml", "path/to/algo2.yaml"] + yaml_content = """ + name: algo1 + groups: + group1: {} + """ + with patch("builtins.open", mock_open(read_data=yaml_content)): + result = load_algorithms_conf(algos_conf_fs, None, None) + assert "algo1" in result + + with patch("builtins.open", mock_open(read_data=yaml_content)): + result = load_algorithms_conf(algos_conf_fs, ["algo1"], None) + assert "algo1" in result + result = load_algorithms_conf(algos_conf_fs, ["algo2"], None) + assert "algo1" not in result + + +@patch( + "benchmark.find_executable", + return_value=("executable", "path", "filename"), +) +@patch("benchmark.validate_algorithm", return_value=True) +@patch( + "benchmark.prepare_indexes", return_value=[{"index_key": "index_value"}] +) +def test_prepare_executables( + mock_prepare_indexes, mock_validate_algorithm, mock_find_executable +): + algos_conf = {"algo1": {"groups": {"group1": {"build": {}, "search": {}}}}} + algos_yaml = {"algo1": {}} + gpu_present = True + conf_file = {} + dataset_path = "dataset_path" + dataset = "dataset" + count = 10 + batch_size = 128 + result = prepare_executables( + algos_conf, + algos_yaml, + gpu_present, + conf_file, + dataset_path, + dataset, + count, + batch_size, + ) + assert "executable" in result + assert len(result["executable"]["index"]) == 1 + + +def test_prepare_indexes(): + group_conf = {"build": {"param1": [1, 2]}, "search": {"param2": [3, 4]}} + conf_file = {"dataset": {"dims": 128}} + result = prepare_indexes( + group_conf, + "algo", + "group", + conf_file, + "dataset_path", + "dataset", + 10, + 128, + ) + assert len(result) == 2 + assert "param1" in result[0]["build_param"] + + +def test_validate_search_params(): + 
all_search_params = itertools.product([1, 2], [3, 4]) + search_param_names = ["param1", "param2"] + group_conf = {} + conf_file = {"dataset": {"dims": 128}} + result = validate_search_params( + all_search_params, + search_param_names, + "algo", + group_conf, + conf_file, + 10, + 128, + ) + assert len(result) == 4 + + +def test_rmm_present(): + with patch.dict("sys.modules", {"rmm": MagicMock()}): + assert rmm_present() is True + with patch.dict("sys.modules", {"rmm": None}): + assert rmm_present() is False + + +@patch("benchmark.get_build_path", return_value="build_path") +def test_find_executable(mock_get_build_path): + algos_conf = {"algo1": {"executable": "executable1"}} + result = find_executable(algos_conf, "algo1", "group1", 10, 128) + assert result == ( + "executable1", + "build_path", + ("algo1,group1", "algo1,group1,k10,bs128"), + ) + mock_get_build_path.return_value = None + with pytest.raises(FileNotFoundError): + find_executable(algos_conf, "algo1", "group1", 10, 128) + + +def test_validate_algorithm(): + algos_conf = {"algo1": {"requires_gpu": False}} + result = validate_algorithm(algos_conf, "algo1", gpu_present=True) + assert result is True + result = validate_algorithm(algos_conf, "algo1", gpu_present=False) + assert result is True + algos_conf["algo1"]["requires_gpu"] = True + result = validate_algorithm(algos_conf, "algo1", gpu_present=False) + assert result is False + + +@patch("benchmark.import_module") +def test_validate_constraints(mock_import_module): + mock_validator = MagicMock() + mock_import_module.return_value = mock_validator + mock_validator.constraint_func.return_value = True + algos_conf = { + "algo1": {"constraints": {"build": "module.constraint_func"}} + } + result = validate_constraints( + algos_conf, "algo1", "build", {"param1": "value1"}, 128, None, None + ) + assert result is True + + algos_conf = {"algo1": {"constraints": {}}} + result = validate_constraints( + algos_conf, "algo1", "build", {"param1": "value1"}, 128, None, 
None + ) + assert result is True + + mock_validator.constraint_func.return_value = False + algos_conf["algo1"]["constraints"]["build"] = "module.constraint_func" + result = validate_constraints( + algos_conf, "algo1", "build", {"param1": "value1"}, 128, None, None + ) + assert result is False diff --git a/python/cuvs_bench/pyproject.toml b/python/cuvs_bench/pyproject.toml index 7bb9e2f8d..41ebad116 100644 --- a/python/cuvs_bench/pyproject.toml +++ b/python/cuvs_bench/pyproject.toml @@ -18,6 +18,10 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ + "click", + "matplotlib", + "pandas", + "pyyaml", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -31,10 +35,7 @@ classifiers = [ ] [project.urls] -Homepage = "https://github.com/rapidsai/raft" - -[tool.setuptools.packages.find] -where = ["src"] +Homepage = "https://github.com/rapidsai/cuvs" [tool.setuptools.package-data] "*" = ["*.*", "VERSION"] @@ -64,7 +65,8 @@ skip = [ version = { file = "cuvs_bench/VERSION" } [tool.rapids-build-backend] -build-backend = "scikit_build_core.build" -requires = [] +build-backend = "setuptools.build_meta" +requires = [ +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true"