Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gen_hdf5_file.cpp to support creating random datasets #164

Merged
merged 1 commit into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ benchmark_test(benchmark_float_bitset hdf5/benchmark_float_bitset.cpp)
benchmark_test(benchmark_float_qps hdf5/benchmark_float_qps.cpp)
benchmark_test(benchmark_float_range hdf5/benchmark_float_range.cpp)
benchmark_test(benchmark_float_range_bitset hdf5/benchmark_float_range_bitset.cpp)

benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp)
105 changes: 18 additions & 87 deletions benchmark/hdf5/benchmark_hdf5.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,20 @@ class Benchmark_hdf5 : public Benchmark_base {
return data_out;
}

// Creates a 2-D dataset named `dataset_name` inside the already opened HDF5 `file` and writes
// `rows * cols` elements of type `type_id` from `data` into it.
//
// file          - handle of the HDF5 file (or group) that receives the dataset
// dataset_name  - name of the dataset to create
// type_id       - HDF5 datatype used both for storage and for the in-memory buffer
// rows, cols    - extent of the two dataset dimensions
// data          - source buffer; presumably holds at least rows * cols elements (not checked here)
//
// HDF5 reports failure through negative return values, so every handle/status is checked for
// ">= 0". Failures trip an assert in debug builds; in NDEBUG builds the function stops using the
// invalid handle and releases whatever was successfully acquired.
void
write_hdf5_dataset(hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
                   const void* data) {
    const hsize_t dims[2] = {static_cast<hsize_t>(rows), static_cast<hsize_t>(cols)};

    const hid_t dataspace = H5Screate_simple(2, dims, nullptr);
    assert(dataspace >= 0);
    if (dataspace < 0) {
        return;
    }

    const hid_t dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    assert(dataset >= 0);
    if (dataset >= 0) {
        const auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
        assert(err >= 0);
        (void)err;  // keeps NDEBUG builds free of an unused-variable warning
        H5Dclose(dataset);
    }
    H5Sclose(dataspace);
}

// For binary vectors, dim should be divided by 32, since int32 is used to store binary vector data.
template <bool is_binary>
void
Expand All @@ -338,31 +352,18 @@ class Benchmark_hdf5 : public Benchmark_base {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
}

/* write ground-truth labels dataset */
Expand All @@ -388,31 +389,18 @@ class Benchmark_hdf5 : public Benchmark_base {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
}

/* write ground-truth radius */
Expand All @@ -431,63 +419,6 @@ class Benchmark_hdf5 : public Benchmark_base {
H5Fclose(file);
}

// For binary vector, dim should be divided by 32, since we use int32 to store binary vector data */
// Write HDF5 file with following dataset:
// HDF5_DATASET_RADIUS - H5T_NATIVE_FLOAT, [1, nq]
// HDF5_DATASET_LIMS - H5T_NATIVE_INT32, [1, nq+1]
// HDF5_DATASET_NEIGHBORS - H5T_NATIVE_INT32, [1, lims[nq]]
// HDF5_DATASET_DISTANCES - H5T_NATIVE_FLOAT, [1, lims[nq]]
template <bool is_binary>
void
hdf5_write_range(const char* file_name, const int32_t dim, const void* xb, const int32_t nb, const void* xq,
const int32_t nq, const float* g_radius, const void* g_lims, const void* g_ids,
const void* g_dist) {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
}

/* write ground-truth radius */
write_hdf5_dataset(file, HDF5_DATASET_RADIUS, H5T_NATIVE_FLOAT, 1, nq, g_radius);

/* write ground-truth lims dataset */
write_hdf5_dataset(file, HDF5_DATASET_LIMS, H5T_NATIVE_INT32, 1, nq + 1, g_lims);

/* write ground-truth labels dataset */
write_hdf5_dataset(file, HDF5_DATASET_NEIGHBORS, H5T_NATIVE_INT32, 1, ((int32_t*)g_lims)[nq], g_ids);

/* write ground-truth distance dataset */
write_hdf5_dataset(file, HDF5_DATASET_DISTANCES, H5T_NATIVE_FLOAT, 1, ((int32_t*)g_lims)[nq], g_dist);

/* Close/release resources. */
H5Fclose(file);
}

protected:
std::string ann_test_name_ = "";
std::string metric_str_;
Expand Down
177 changes: 177 additions & 0 deletions benchmark/hdf5/gen_hdf5_file.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
// Copyright (C) 2019-2023 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#include <gtest/gtest.h>

#include <algorithm>
#include <cstdint>
#include <random>
#include <string>
#include <vector>

#include "benchmark_hdf5.h"
#include "knowhere/comp/brute_force.h"
#include "knowhere/comp/index_param.h"
#include "knowhere/comp/knowhere_config.h"
#include "knowhere/dataset.h"

// Builds a dataset of `rows` x `dim` random floats drawn uniformly from [-1.0, 1.0).
//
// rows, dim - shape of the generated tensor
// seed      - seed of the Mersenne Twister generator; the default (42) keeps the output
//             reproducible. Note that two calls with the same seed and dim start with the
//             same values, so pass distinct seeds when independent datasets are needed.
//
// The buffer is allocated with new[] and handed to the returned dataset, which is marked as
// owner via SetIsOwner(true); callers must not free it themselves.
knowhere::DataSetPtr
GenDataSet(int rows, int dim, int seed = 42) {
    // Compute the element count in 64 bits: rows * dim overflows int for large datasets.
    const int64_t elem_cnt = static_cast<int64_t>(rows) * static_cast<int64_t>(dim);

    std::mt19937 rng(seed);
    std::uniform_real_distribution<> distrib(-1.0, 1.0);
    float* ts = new float[elem_cnt];
    for (int64_t i = 0; i < elem_cnt; ++i) {
        ts[i] = static_cast<float>(distrib(rng));
    }
    auto ds = knowhere::GenDataSet(rows, dim, ts);
    ds->SetIsOwner(true);
    return ds;
}

// Builds a dataset of `rows` random binary vectors of `dim` bits each.
//
// rows - number of vectors
// dim  - vector dimension in bits; each vector occupies dim / 8 bytes (integer division, so
//        any remainder bits are not generated)
// seed - seed of the Mersenne Twister generator; the default (42) keeps the output
//        reproducible. Two calls with the same seed and dim start with the same bytes.
//
// Every byte is drawn uniformly from [0, 255]. The buffer is allocated with new[] and handed to
// the returned dataset, which is marked as owner via SetIsOwner(true).
knowhere::DataSetPtr
GenBinDataSet(int rows, int dim, int seed = 42) {
    const int uint8_num = dim / 8;
    // Compute the byte count in 64 bits: rows * uint8_num overflows int for large datasets.
    const int64_t byte_cnt = static_cast<int64_t>(rows) * static_cast<int64_t>(uint8_num);

    std::mt19937 rng(seed);
    std::uniform_int_distribution<> distrib(0, 255);
    uint8_t* ts = new uint8_t[byte_cnt];
    for (int64_t i = 0; i < byte_cnt; ++i) {
        ts[i] = static_cast<uint8_t>(distrib(rng));
    }
    auto ds = knowhere::GenDataSet(rows, dim, ts);
    ds->SetIsOwner(true);
    return ds;
}

class Create_HDF5 : public Benchmark_hdf5, public ::testing::Test {
protected:
void
SetUp() override {
}

void
TearDown() override {
}

template <bool is_binary>
void
create_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq, const int64_t dim,
const int64_t topk) {
std::string metric_str = metric_type;
transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower);
std::string fn = "rand-" + std::to_string(dim) + "-" + metric_str + ".hdf5";

knowhere::Json json;
json[knowhere::meta::DIM] = dim;
json[knowhere::meta::METRIC_TYPE] = metric_type;
json[knowhere::meta::TOPK] = topk;

knowhere::DataSetPtr xb_ds, xq_ds;
if (is_binary) {
xb_ds = GenBinDataSet(nb, dim);
xq_ds = GenBinDataSet(nq, dim);
} else {
xb_ds = GenDataSet(nb, dim);
xq_ds = GenDataSet(nq, dim);
}

auto result = knowhere::BruteForce::Search(xb_ds, xq_ds, json, nullptr);
assert(result.has_value());

// convert golden_ids to int32
auto elem_cnt = nq * topk;
std::vector<int32_t> gt_ids_int(elem_cnt);
for (int32_t i = 0; i < elem_cnt; i++) {
gt_ids_int[i] = result.value()->GetIds()[i];
}

hdf5_write<is_binary>(fn.c_str(), dim, topk, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, gt_ids_int.data(),
result.value()->GetDistance());
}

template <bool is_binary>
void
create_range_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq,
const int64_t dim, const float radius) {
std::string metric_str = metric_type;
transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower);
std::string fn = "rand-" + std::to_string(dim) + "-" + metric_str + "-range.hdf5";

knowhere::Json json;
json[knowhere::meta::DIM] = dim;
json[knowhere::meta::METRIC_TYPE] = metric_type;
json[knowhere::meta::RADIUS] = radius;

knowhere::DataSetPtr xb_ds, xq_ds;
if (is_binary) {
xb_ds = GenBinDataSet(nb, dim);
xq_ds = GenBinDataSet(nq, dim);
} else {
xb_ds = GenDataSet(nb, dim);
xq_ds = GenDataSet(nq, dim);
}

auto result = knowhere::BruteForce::RangeSearch(xb_ds, xq_ds, json, nullptr);
assert(result.has_value());

// convert golden_lims to int32
std::vector<int32_t> gt_lims_int(nq + 1);
for (int32_t i = 0; i <= nq; i++) {
gt_lims_int[i] = result.value()->GetLims()[i];
}

// convert golden_ids to int32
auto elem_cnt = result.value()->GetLims()[nq];
std::vector<int32_t> gt_ids_int(elem_cnt);
for (int32_t i = 0; i < elem_cnt; i++) {
gt_ids_int[i] = result.value()->GetIds()[i];
}

hdf5_write_range<is_binary>(fn.c_str(), dim, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, radius,
gt_lims_int.data(), gt_ids_int.data(), result.value()->GetDistance());
}
};

// Creates top-k ground-truth files for random float vectors, one per metric (L2, IP, COSINE).
TEST_F(Create_HDF5, CREATE_FLOAT) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t vec_dim = 128;
    const int64_t k = 100;

    create_hdf5_file<false>(knowhere::metric::L2, base_rows, query_rows, vec_dim, k);
    create_hdf5_file<false>(knowhere::metric::IP, base_rows, query_rows, vec_dim, k);
    create_hdf5_file<false>(knowhere::metric::COSINE, base_rows, query_rows, vec_dim, k);
}

// Creates range-search ground-truth files for random float vectors; each metric uses its own
// radius (L2: 65.0, IP: 8.7, COSINE: 0.2).
TEST_F(Create_HDF5, CREATE_FLOAT_RANGE) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t vec_dim = 128;

    create_range_hdf5_file<false>(knowhere::metric::L2, base_rows, query_rows, vec_dim, 65.0);
    create_range_hdf5_file<false>(knowhere::metric::IP, base_rows, query_rows, vec_dim, 8.7);
    create_range_hdf5_file<false>(knowhere::metric::COSINE, base_rows, query_rows, vec_dim, 0.2);
}

// Creates top-k ground-truth files for random 1024-bit binary vectors (HAMMING and JACCARD).
TEST_F(Create_HDF5, CREATE_BINARY) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t bit_dim = 1024;
    const int64_t k = 100;

    create_hdf5_file<true>(knowhere::metric::HAMMING, base_rows, query_rows, bit_dim, k);
    create_hdf5_file<true>(knowhere::metric::JACCARD, base_rows, query_rows, bit_dim, k);
}

// Creates range-search ground-truth files for random 1024-bit binary vectors; each metric uses
// its own radius (HAMMING: 476, JACCARD: 0.63).
TEST_F(Create_HDF5, CREATE_BINARY_RANGE) {
    const int64_t base_rows = 10000;
    const int64_t query_rows = 100;
    const int64_t bit_dim = 1024;

    create_range_hdf5_file<true>(knowhere::metric::HAMMING, base_rows, query_rows, bit_dim, 476);
    create_range_hdf5_file<true>(knowhere::metric::JACCARD, base_rows, query_rows, bit_dim, 0.63);
}
Loading