From 5d719b2f9fd5ad4ccdfbac6d5e0c76a98fb000a2 Mon Sep 17 00:00:00 2001 From: weijian Date: Mon, 13 May 2024 11:25:04 +0000 Subject: [PATCH 1/6] YFCC --- CMakeLists.txt | 3 +- glass/hnsw/hnsw.hpp | 27 ++++++++ main.cpp | 161 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index de0a37b..0041bfd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ set(CXX_STANDARD 20) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) SET(CMAKE_CXX_FLAGS "-Wall -Wextra -O3 -lrt -std=c++20 -march=native -fpic -fopenmp -ftree-vectorize -fno-exceptions -fno-rtti" ) +#SET(CMAKE_CXX_FLAGS "-Wall -Wextra -O0 -g -lrt -std=c++20 -march=native -fopenmp" ) -add_executable(main examples/main.cc) +add_executable(main main.cpp) target_link_libraries(main glass) diff --git a/glass/hnsw/hnsw.hpp b/glass/hnsw/hnsw.hpp index 8daf863..46b1a93 100644 --- a/glass/hnsw/hnsw.hpp +++ b/glass/hnsw/hnsw.hpp @@ -75,6 +75,33 @@ struct HNSW : public Builder { } } final_graph.initializer = std::move(initializer); + +//#pragma omp parallel for +// for (int i = 0; i < nb; ++i) { +// auto internal_id = hnsw->label_lookup_[i]; +// int *edges = (int *)hnsw->get_linklist0(internal_id); +// for (int j = 1; j <= edges[0]; ++j) { +// int external_id = hnsw->getExternalLabel(edges[j]); +// final_graph.at(i, j - 1) = external_id; +// } +// } +// auto initializer = std::make_unique(nb, M); +// initializer->ep = hnsw->getExternalLabel(hnsw->enterpoint_node_); +// for (int i = 0; i < nb; ++i) { +// auto internal_id = hnsw->label_lookup_[i]; +// int level = hnsw->element_levels_[internal_id]; +// initializer->levels[i] = level; +// if (level > 0) { +// initializer->lists[i].assign(level * M, -1); +// for (int j = 1; j <= level; ++j) { +// int *edges = (int *)hnsw->get_linklist(internal_id, j); +// for (int k = 1; k <= edges[0]; ++k) { +// initializer->at(j, i, k - 1) = hnsw->getExternalLabel(edges[k]); +// } +// } +// } +// } +// final_graph.initializer = std::move(initializer); } Graph GetGraph() override { return final_graph; } diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..be2ffca --- /dev/null +++ b/main.cpp @@ -0,0 +1,161 @@ +// +// Created by weijian on 5/13/24. +// + +#include +#include +#include +#include +#include +#include +#include + +#include "glass/searcher.hpp" +#include "glass/quant/fp32_quant.hpp" +#include "glass/hnsw/hnsw.hpp" + +using std::cout; +using std::endl; +using std::string; +using std::vector; + +///// @brief Reading binary data vectors. Raw data store as a (N x dim) +///// @param file_path file path of binary data +///// @param data returned 2D data vectors +//template +//void ReadBin(const std::string &file_path, +// std::vector> &data) { +// std::cout << "Reading Data: " << file_path << std::endl; +// std::ifstream ifs; +// ifs.open(file_path, std::ios::binary); +// assert(ifs.is_open()); +// unsigned N; // num of points +// unsigned num_dimensions; +// ifs.read((char *)&N, sizeof(unsigned)); +// ifs.read((char *)&num_dimensions, sizeof(unsigned)); +// data.resize(N); +// std::cout << "# of points: " << N << std::endl; +// std::cout << "# of dimensions: " << num_dimensions << std::endl; +// std::vector buff(num_dimensions); +// int counter = 0; +// while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { +// std::vector row(num_dimensions); +// for (int d = 0; d < num_dimensions; d++) { +// row[d] = static_cast(buff[d]); +// } +// data[counter++] = std::move(row); +// } +// ifs.close(); +// std::cout << "Finish Reading Data" << endl; +//} + +/// @brief Reading binary data vectors. Raw data store as a (N x dim) +/// @param file_path file path of binary data +/// @param data returned 2D data vectors +template +void ReadBin(const std::string &file_path, + const int num_dimensions, + std::vector> &data) { + std::cout << "Reading Data: " << file_path << std::endl; + std::ifstream ifs; + ifs.open(file_path, std::ios::binary); + assert(ifs.is_open()); + unsigned N; // num of points + ifs.read((char *)&N, sizeof(unsigned)); + data.resize(N); + std::cout << "# of points: " << N << std::endl; + std::vector buff(num_dimensions); + int counter = 0; + while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { + std::vector row(num_dimensions); + for (int d = 0; d < num_dimensions; d++) { + row[d] = static_cast(buff[d]); + } + data[counter++] = std::move(row); + } + ifs.close(); + std::cout << "Finish Reading Data" << endl; +} + +int main() { +// string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; +// string query_file = "/data/deep1b/query.public.10K.fbin"; +// string gt_file = "/data/deep1b/deep-10M"; + const string data_file = "/dataset/sigmod2024/medium/contest-data-release-1m.bin", + query_file = "/dataset/sigmod2024/medium/contest-queries-release-1m.bin", + gt_file = "/dataset/sigmod2024/medium/contest-gt-release-1m.bin"; +// const string data_file = "/dataset/sigmod2024/large/contest-data-release-10m.bin", +// query_file = "/dataset/sigmod2024/large/contest-queries-release-10m.bin", +// gt_file = "/dataset/sigmod2024/large/contest-gt-release-10m.bin"; + + vector> data{}, queries{}; + vector> GT{}; + + vector data_buf{}; + + ReadBin(data_file, 102, data); + ReadBin(query_file, 104, queries); + ReadBin(gt_file, 100, GT); + + int nb = data.size(), d = 100; + int nq = queries.size(); + int k = 100; + vector> output(nq, vector(k, 0)); + + bool update_index = true; + + cout << nb << ", " << d << endl; + cout << nq << endl; + cout << nq << ", " << queries.front().size() << endl; + cout << output.size() << ", " << output.front().size() << endl; + data_buf.resize(nb * d); +#pragma omp parallel for + for(int i = 0; i < nb; i++) { + std::memcpy(data_buf.data() + i * 100, data[i].data() + 2, d * 4); + } + + glass::HNSW index(d, "L2"); + if(update_index) { + index.Build(data_buf.data(), nb); + index.final_graph.save("hnsw_index_glass"); + } else { + index.final_graph.load("hnsw_index_glass"); + } + glass::Searcher> searcher(index.final_graph); + searcher.SetData(data_buf.data(), nb, d); + searcher.SetEf(500); + searcher.Optimize(96); + cout << "11111" << endl; + + auto start = std::chrono::high_resolution_clock::now(); +#pragma omp parallel for schedule(dynamic) + for(int i = 0; i < nq; i++) { + if(queries[i][0] == 0) { + searcher.Search(queries[i].data() + 4, k, output[i].data()); + } + } + auto end = std::chrono::high_resolution_clock::now(); + cout << "search time: " << std::chrono::duration(end - start).count() << " s" << std::endl; + + std::atomic total_coselection{0}; + std::atomic total_num{0}; +#pragma omp parallel for +// int total_coselection = 0; +// int total_num = 0; + for (int i = 0; i < nq; i++) { + if(queries[i][0] == 0) { + int cur_coselection = 0; + std::set gt(GT[i].begin(), GT[i].end()); + std::set res(output[i].begin(), output[i].end()); + for (auto item: res) { + if (gt.find(item) != gt.end()) { + cur_coselection++; + } + } + total_num += 1; + total_coselection += cur_coselection; + } + } + + std::cout << "recall = " << (double) total_coselection * 100 / (total_num * 100) << " %" << std::endl; +} \ No newline at end of file From a19950732d28a816171e3a6a86aa9fa82506dfa0 Mon Sep 17 00:00:00 2001 From: weijian Date: Mon, 13 May 2024 11:39:35 +0000 Subject: [PATCH 2/6] fix --- main.cpp | 113 +++++++++++++++++++++++++++---------------------------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/main.cpp b/main.cpp index be2ffca..7cff5b7 100644 --- a/main.cpp +++ b/main.cpp @@ -19,51 +19,23 @@ using std::endl; using std::string; using std::vector; -///// @brief Reading binary data vectors. Raw data store as a (N x dim) -///// @param file_path file path of binary data -///// @param data returned 2D data vectors -//template -//void ReadBin(const std::string &file_path, -// std::vector> &data) { -// std::cout << "Reading Data: " << file_path << std::endl; -// std::ifstream ifs; -// ifs.open(file_path, std::ios::binary); -// assert(ifs.is_open()); -// unsigned N; // num of points -// unsigned num_dimensions; -// ifs.read((char *)&N, sizeof(unsigned)); -// ifs.read((char *)&num_dimensions, sizeof(unsigned)); -// data.resize(N); -// std::cout << "# of points: " << N << std::endl; -// std::cout << "# of dimensions: " << num_dimensions << std::endl; -// std::vector buff(num_dimensions); -// int counter = 0; -// while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { -// std::vector row(num_dimensions); -// for (int d = 0; d < num_dimensions; d++) { -// row[d] = static_cast(buff[d]); -// } -// data[counter++] = std::move(row); -// } -// ifs.close(); -// std::cout << "Finish Reading Data" << endl; -//} - /// @brief Reading binary data vectors. Raw data store as a (N x dim) /// @param file_path file path of binary data /// @param data returned 2D data vectors template void ReadBin(const std::string &file_path, - const int num_dimensions, std::vector> &data) { std::cout << "Reading Data: " << file_path << std::endl; std::ifstream ifs; ifs.open(file_path, std::ios::binary); assert(ifs.is_open()); unsigned N; // num of points + unsigned num_dimensions; ifs.read((char *)&N, sizeof(unsigned)); + ifs.read((char *)&num_dimensions, sizeof(unsigned)); data.resize(N); std::cout << "# of points: " << N << std::endl; + std::cout << "# of dimensions: " << num_dimensions << std::endl; std::vector buff(num_dimensions); int counter = 0; while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { @@ -77,32 +49,63 @@ void ReadBin(const std::string &file_path, std::cout << "Finish Reading Data" << endl; } +///// @brief Reading binary data vectors. Raw data store as a (N x dim) +///// @param file_path file path of binary data +///// @param data returned 2D data vectors +//template +//void ReadBin(const std::string &file_path, +// const int num_dimensions, +// std::vector> &data) { +// std::cout << "Reading Data: " << file_path << std::endl; +// std::ifstream ifs; +// ifs.open(file_path, std::ios::binary); +// assert(ifs.is_open()); +// unsigned N; // num of points +// ifs.read((char *)&N, sizeof(unsigned)); +// data.resize(N); +// std::cout << "# of points: " << N << std::endl; +// std::vector buff(num_dimensions); +// int counter = 0; +// while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { +// std::vector row(num_dimensions); +// for (int d = 0; d < num_dimensions; d++) { +// row[d] = static_cast(buff[d]); +// } +// data[counter++] = std::move(row); +// } +// ifs.close(); +// std::cout << "Finish Reading Data" << endl; +//} + int main() { -// string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; -// string query_file = "/data/deep1b/query.public.10K.fbin"; -// string gt_file = "/data/deep1b/deep-10M"; - const string data_file = "/dataset/sigmod2024/medium/contest-data-release-1m.bin", - query_file = "/dataset/sigmod2024/medium/contest-queries-release-1m.bin", - gt_file = "/dataset/sigmod2024/medium/contest-gt-release-1m.bin"; + string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; + string query_file = "/data/deep1b/query.public.10K.fbin"; + string gt_file = "/data/deep1b/deep-10M"; +// const string data_file = "/dataset/sigmod2024/medium/contest-data-release-1m.bin", +// query_file = "/dataset/sigmod2024/medium/contest-queries-release-1m.bin", +// gt_file = "/dataset/sigmod2024/medium/contest-gt-release-1m.bin"; // const string data_file = "/dataset/sigmod2024/large/contest-data-release-10m.bin", // query_file = "/dataset/sigmod2024/large/contest-queries-release-10m.bin", // gt_file = "/dataset/sigmod2024/large/contest-gt-release-10m.bin"; vector> data{}, queries{}; - vector> GT{}; + vector> GT{}; vector data_buf{}; - ReadBin(data_file, 102, data); - ReadBin(query_file, 104, queries); - ReadBin(gt_file, 100, GT); +// ReadBin(data_file, 102, data); +// ReadBin(query_file, 104, queries); +// ReadBin(gt_file, 100, GT); + ReadBin(data_file, data); + ReadBin(query_file, queries); + ReadBin(gt_file, GT); - int nb = data.size(), d = 100; + int nb = data.size(), d = data.front().size(); int nq = queries.size(); int k = 100; vector> output(nq, vector(k, 0)); - bool update_index = true; + bool update_index = false; cout << nb << ", " << d << endl; cout << nq << endl; @@ -111,7 +114,7 @@ int main() { data_buf.resize(nb * d); #pragma omp parallel for for(int i = 0; i < nb; i++) { - std::memcpy(data_buf.data() + i * 100, data[i].data() + 2, d * 4); + std::memcpy(data_buf.data() + i * 100, data[i].data(), d * 4); } glass::HNSW index(d, "L2"); @@ -130,9 +133,7 @@ int main() { auto start = std::chrono::high_resolution_clock::now(); #pragma omp parallel for schedule(dynamic) for(int i = 0; i < nq; i++) { - if(queries[i][0] == 0) { - searcher.Search(queries[i].data() + 4, k, output[i].data()); - } + searcher.Search(queries[i].data(), k, output[i].data()); } auto end = std::chrono::high_resolution_clock::now(); cout << "search time: " << std::chrono::duration(end - start).count() << " s" << std::endl; @@ -143,18 +144,16 @@ int main() { // int total_coselection = 0; // int total_num = 0; for (int i = 0; i < nq; i++) { - if(queries[i][0] == 0) { - int cur_coselection = 0; - std::set gt(GT[i].begin(), GT[i].end()); - std::set res(output[i].begin(), output[i].end()); - for (auto item: res) { - if (gt.find(item) != gt.end()) { - cur_coselection++; - } + int cur_coselection = 0; + std::set gt(GT[i].begin(), GT[i].end()); + std::set res(output[i].begin(), output[i].end()); + for (auto item: res) { + if (gt.find(static_cast(item)) != gt.end()) { + cur_coselection++; } - total_num += 1; - total_coselection += cur_coselection; } + total_num += 1; + total_coselection += cur_coselection; } std::cout << "recall = " << (double) total_coselection * 100 / (total_num * 100) << " %" << std::endl; From 840745b10b4d9e1ccbcd4b0f60f60c49f4119578 Mon Sep 17 00:00:00 2001 From: weijian Date: Mon, 13 May 2024 12:37:33 +0000 Subject: [PATCH 3/6] fix --- .gitignore | 1 + CMakeLists.txt | 1 - main.cpp | 65 +++++++------------------------------------------- 3 files changed, 10 insertions(+), 57 deletions(-) diff --git a/.gitignore b/.gitignore index 9785597..068de5c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build .cache +cmake-build-release diff --git a/CMakeLists.txt b/CMakeLists.txt index 0041bfd..6c67c74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,6 @@ set(CXX_STANDARD 20) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) SET(CMAKE_CXX_FLAGS "-Wall -Wextra -O3 -lrt -std=c++20 -march=native -fpic -fopenmp -ftree-vectorize -fno-exceptions -fno-rtti" ) -#SET(CMAKE_CXX_FLAGS "-Wall -Wextra -O0 -g -lrt -std=c++20 -march=native -fopenmp" ) add_executable(main main.cpp) target_link_libraries(main glass) diff --git a/main.cpp b/main.cpp index 7cff5b7..e3a7207 100644 --- a/main.cpp +++ b/main.cpp @@ -38,7 +38,8 @@ void ReadBin(const std::string &file_path, std::cout << "# of dimensions: " << num_dimensions << std::endl; std::vector buff(num_dimensions); int counter = 0; - while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { + for(int i = 0; i < N; i++) { + ifs.read((char *)buff.data(), num_dimensions * sizeof(T)); std::vector row(num_dimensions); for (int d = 0; d < num_dimensions; d++) { row[d] = static_cast(buff[d]); @@ -49,53 +50,18 @@ void ReadBin(const std::string &file_path, std::cout << "Finish Reading Data" << endl; } -///// @brief Reading binary data vectors. Raw data store as a (N x dim) -///// @param file_path file path of binary data -///// @param data returned 2D data vectors -//template -//void ReadBin(const std::string &file_path, -// const int num_dimensions, -// std::vector> &data) { -// std::cout << "Reading Data: " << file_path << std::endl; -// std::ifstream ifs; -// ifs.open(file_path, std::ios::binary); -// assert(ifs.is_open()); -// unsigned N; // num of points -// ifs.read((char *)&N, sizeof(unsigned)); -// data.resize(N); -// std::cout << "# of points: " << N << std::endl; -// std::vector buff(num_dimensions); -// int counter = 0; -// while (ifs.read((char *)buff.data(), num_dimensions * sizeof(T))) { -// std::vector row(num_dimensions); -// for (int d = 0; d < num_dimensions; d++) { -// row[d] = static_cast(buff[d]); -// } -// data[counter++] = std::move(row); -// } -// ifs.close(); -// std::cout << "Finish Reading Data" << endl; -//} + int main() { + cout << sizeof(unsigned) << ", " << sizeof(int64_t) << endl; string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; string query_file = "/data/deep1b/query.public.10K.fbin"; string gt_file = "/data/deep1b/deep-10M"; -// const string data_file = "/dataset/sigmod2024/medium/contest-data-release-1m.bin", -// query_file = "/dataset/sigmod2024/medium/contest-queries-release-1m.bin", -// gt_file = "/dataset/sigmod2024/medium/contest-gt-release-1m.bin"; -// const string data_file = "/dataset/sigmod2024/large/contest-data-release-10m.bin", -// query_file = "/dataset/sigmod2024/large/contest-queries-release-10m.bin", -// gt_file = "/dataset/sigmod2024/large/contest-gt-release-10m.bin"; vector> data{}, queries{}; - vector> GT{}; - + vector> GT{}; vector data_buf{}; -// ReadBin(data_file, 102, data); -// ReadBin(query_file, 104, queries); -// ReadBin(gt_file, 100, GT); ReadBin(data_file, data); ReadBin(query_file, queries); ReadBin(gt_file, GT); @@ -104,31 +70,20 @@ int main() { int nq = queries.size(); int k = 100; vector> output(nq, vector(k, 0)); - - bool update_index = false; - - cout << nb << ", " << d << endl; - cout << nq << endl; - cout << nq << ", " << queries.front().size() << endl; - cout << output.size() << ", " << output.front().size() << endl; data_buf.resize(nb * d); #pragma omp parallel for for(int i = 0; i < nb; i++) { - std::memcpy(data_buf.data() + i * 100, data[i].data(), d * 4); + std::memcpy(data_buf.data() + i * d, data[i].data(), d * 4); } glass::HNSW index(d, "L2"); - if(update_index) { - index.Build(data_buf.data(), nb); - index.final_graph.save("hnsw_index_glass"); - } else { - index.final_graph.load("hnsw_index_glass"); - } + index.Build(data_buf.data(), nb); + index.final_graph.save("hnsw_index_glass"); + glass::Searcher> searcher(index.final_graph); searcher.SetData(data_buf.data(), nb, d); searcher.SetEf(500); searcher.Optimize(96); - cout << "11111" << endl; auto start = std::chrono::high_resolution_clock::now(); #pragma omp parallel for schedule(dynamic) @@ -141,8 +96,6 @@ int main() { std::atomic total_coselection{0}; std::atomic total_num{0}; #pragma omp parallel for -// int total_coselection = 0; -// int total_num = 0; for (int i = 0; i < nq; i++) { int cur_coselection = 0; std::set gt(GT[i].begin(), GT[i].end()); From 6347f2a60e76dd003f20bf4deaf0966619f29ad9 Mon Sep 17 00:00:00 2001 From: weijian Date: Tue, 14 May 2024 01:46:11 +0000 Subject: [PATCH 4/6] fix --- glass/hnsw/hnsw.hpp | 92 ++++++++++++++++++++++----------------------- main.cpp | 10 +++-- 2 files changed, 52 insertions(+), 50 deletions(-) diff --git a/glass/hnsw/hnsw.hpp b/glass/hnsw/hnsw.hpp index 46b1a93..3329b35 100644 --- a/glass/hnsw/hnsw.hpp +++ b/glass/hnsw/hnsw.hpp @@ -52,56 +52,56 @@ struct HNSW : public Builder { auto ela = std::chrono::duration(ed - st).count(); printf("HNSW building cost: %.2lfs\n", ela); final_graph.init(nb, 2 * M); -#pragma omp parallel for - for (int i = 0; i < nb; ++i) { - int *edges = (int *)hnsw->get_linklist0(i); - for (int j = 1; j <= edges[0]; ++j) { - final_graph.at(i, j - 1) = edges[j]; - } - } - auto initializer = std::make_unique(nb, M); - initializer->ep = hnsw->enterpoint_node_; - for (int i = 0; i < nb; ++i) { - int level = hnsw->element_levels_[i]; - initializer->levels[i] = level; - if (level > 0) { - initializer->lists[i].assign(level * M, -1); - for (int j = 1; j <= level; ++j) { - int *edges = (int *)hnsw->get_linklist(i, j); - for (int k = 1; k <= edges[0]; ++k) { - initializer->at(j, i, k - 1) = edges[k]; - } - } - } - } - final_graph.initializer = std::move(initializer); - //#pragma omp parallel for -// for (int i = 0; i < nb; ++i) { -// auto internal_id = hnsw->label_lookup_[i]; -// int *edges = (int *)hnsw->get_linklist0(internal_id); -// for (int j = 1; j <= edges[0]; ++j) { -// int external_id = hnsw->getExternalLabel(edges[j]); -// final_graph.at(i, j - 1) = external_id; -// } +// for (int i = 0; i < nb; ++i) { +// int *edges = (int *)hnsw->get_linklist0(i); +// for (int j = 1; j <= edges[0]; ++j) { +// final_graph.at(i, j - 1) = edges[j]; // } -// auto initializer = std::make_unique(nb, M); -// initializer->ep = hnsw->getExternalLabel(hnsw->enterpoint_node_); -// for (int i = 0; i < nb; ++i) { -// auto internal_id = hnsw->label_lookup_[i]; -// int level = hnsw->element_levels_[internal_id]; -// initializer->levels[i] = level; -// if (level > 0) { -// initializer->lists[i].assign(level * M, -1); -// for (int j = 1; j <= level; ++j) { -// int *edges = (int *)hnsw->get_linklist(internal_id, j); -// for (int k = 1; k <= edges[0]; ++k) { -// initializer->at(j, i, k - 1) = hnsw->getExternalLabel(edges[k]); -// } -// } +// } +// auto initializer = std::make_unique(nb, M); +// initializer->ep = hnsw->enterpoint_node_; +// for (int i = 0; i < nb; ++i) { +// int level = hnsw->element_levels_[i]; +// initializer->levels[i] = level; +// if (level > 0) { +// initializer->lists[i].assign(level * M, -1); +// for (int j = 1; j <= level; ++j) { +// int *edges = (int *)hnsw->get_linklist(i, j); +// for (int k = 1; k <= edges[0]; ++k) { +// initializer->at(j, i, k - 1) = edges[k]; // } +// } // } -// final_graph.initializer = std::move(initializer); +// } +// final_graph.initializer = std::move(initializer); + +#pragma omp parallel for + for (int i = 0; i < nb; ++i) { + auto internal_id = hnsw->label_lookup_[i]; + int *edges = (int *)hnsw->get_linklist0(internal_id); + for (int j = 1; j <= edges[0]; ++j) { + int external_id = hnsw->getExternalLabel(edges[j]); + final_graph.at(i, j - 1) = external_id; + } + } + auto initializer = std::make_unique(nb, M); + initializer->ep = hnsw->getExternalLabel(hnsw->enterpoint_node_); + for (int i = 0; i < nb; ++i) { + auto internal_id = hnsw->label_lookup_[i]; + int level = hnsw->element_levels_[internal_id]; + initializer->levels[i] = level; + if (level > 0) { + initializer->lists[i].assign(level * M, -1); + for (int j = 1; j <= level; ++j) { + int *edges = (int *)hnsw->get_linklist(internal_id, j); + for (int k = 1; k <= edges[0]; ++k) { + initializer->at(j, i, k - 1) = hnsw->getExternalLabel(edges[k]); + } + } + } + } + final_graph.initializer = std::move(initializer); } Graph GetGraph() override { return final_graph; } diff --git a/main.cpp b/main.cpp index e3a7207..853093a 100644 --- a/main.cpp +++ b/main.cpp @@ -53,10 +53,12 @@ void ReadBin(const std::string &file_path, int main() { - cout << sizeof(unsigned) << ", " << sizeof(int64_t) << endl; - string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; - string query_file = "/data/deep1b/query.public.10K.fbin"; - string gt_file = "/data/deep1b/deep-10M"; +// string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; +// string query_file = "/data/deep1b/query.public.10K.fbin"; +// string gt_file = "/data/deep1b/deep-10M"; + string data_file = "/home/weijian/workspace/deep1m/deep1m_base.fbin"; + string query_file = "/home/weijian/workspace/deep1m/deep1m_query.fbin"; + string gt_file = "/home/weijian/workspace/deep1m/deep1m_gt"; vector> data{}, queries{}; vector> GT{}; From 90196b6e1d13493a92c288932e53b6f19291b1a3 Mon Sep 17 00:00:00 2001 From: weijian Date: Tue, 14 May 2024 01:49:50 +0000 Subject: [PATCH 5/6] fix --- CMakeLists.txt | 2 +- glass/hnsw/hnsw.hpp | 24 --------- main.cpp | 115 -------------------------------------------- 3 files changed, 1 insertion(+), 140 deletions(-) delete mode 100644 main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c67c74..de0a37b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,5 +10,5 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) SET(CMAKE_CXX_FLAGS "-Wall -Wextra -O3 -lrt -std=c++20 -march=native -fpic -fopenmp -ftree-vectorize -fno-exceptions -fno-rtti" ) -add_executable(main main.cpp) +add_executable(main examples/main.cc) target_link_libraries(main glass) diff --git a/glass/hnsw/hnsw.hpp b/glass/hnsw/hnsw.hpp index 3329b35..3d27ebb 100644 --- a/glass/hnsw/hnsw.hpp +++ b/glass/hnsw/hnsw.hpp @@ -52,30 +52,6 @@ struct HNSW : public Builder { auto ela = std::chrono::duration(ed - st).count(); printf("HNSW building cost: %.2lfs\n", ela); final_graph.init(nb, 2 * M); -//#pragma omp parallel for -// for (int i = 0; i < nb; ++i) { -// int *edges = (int *)hnsw->get_linklist0(i); -// for (int j = 1; j <= edges[0]; ++j) { -// final_graph.at(i, j - 1) = edges[j]; -// } -// } -// auto initializer = std::make_unique(nb, M); -// initializer->ep = hnsw->enterpoint_node_; -// for (int i = 0; i < nb; ++i) { -// int level = hnsw->element_levels_[i]; -// initializer->levels[i] = level; -// if (level > 0) { -// initializer->lists[i].assign(level * M, -1); -// for (int j = 1; j <= level; ++j) { -// int *edges = (int *)hnsw->get_linklist(i, j); -// for (int k = 1; k <= edges[0]; ++k) { -// initializer->at(j, i, k - 1) = edges[k]; -// } -// } -// } -// } -// final_graph.initializer = std::move(initializer); - #pragma omp parallel for for (int i = 0; i < nb; ++i) { auto internal_id = hnsw->label_lookup_[i]; diff --git a/main.cpp b/main.cpp deleted file mode 100644 index 853093a..0000000 --- a/main.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// -// Created by weijian on 5/13/24. -// - -#include -#include -#include -#include -#include -#include -#include - -#include "glass/searcher.hpp" -#include "glass/quant/fp32_quant.hpp" -#include "glass/hnsw/hnsw.hpp" - -using std::cout; -using std::endl; -using std::string; -using std::vector; - -/// @brief Reading binary data vectors. Raw data store as a (N x dim) -/// @param file_path file path of binary data -/// @param data returned 2D data vectors -template -void ReadBin(const std::string &file_path, - std::vector> &data) { - std::cout << "Reading Data: " << file_path << std::endl; - std::ifstream ifs; - ifs.open(file_path, std::ios::binary); - assert(ifs.is_open()); - unsigned N; // num of points - unsigned num_dimensions; - ifs.read((char *)&N, sizeof(unsigned)); - ifs.read((char *)&num_dimensions, sizeof(unsigned)); - data.resize(N); - std::cout << "# of points: " << N << std::endl; - std::cout << "# of dimensions: " << num_dimensions << std::endl; - std::vector buff(num_dimensions); - int counter = 0; - for(int i = 0; i < N; i++) { - ifs.read((char *)buff.data(), num_dimensions * sizeof(T)); - std::vector row(num_dimensions); - for (int d = 0; d < num_dimensions; d++) { - row[d] = static_cast(buff[d]); - } - data[counter++] = std::move(row); - } - ifs.close(); - std::cout << "Finish Reading Data" << endl; -} - - - -int main() { -// string data_file = "/data/deep1b/base.1B.fbin.crop_nb_10000000"; -// string query_file = "/data/deep1b/query.public.10K.fbin"; -// string gt_file = "/data/deep1b/deep-10M"; - string data_file = "/home/weijian/workspace/deep1m/deep1m_base.fbin"; - string query_file = "/home/weijian/workspace/deep1m/deep1m_query.fbin"; - string gt_file = "/home/weijian/workspace/deep1m/deep1m_gt"; - - vector> data{}, queries{}; - vector> GT{}; - vector data_buf{}; - - ReadBin(data_file, data); - ReadBin(query_file, queries); - ReadBin(gt_file, GT); - - int nb = data.size(), d = data.front().size(); - int nq = queries.size(); - int k = 100; - vector> output(nq, vector(k, 0)); - data_buf.resize(nb * d); -#pragma omp parallel for - for(int i = 0; i < nb; i++) { - std::memcpy(data_buf.data() + i * d, data[i].data(), d * 4); - } - - glass::HNSW index(d, "L2"); - index.Build(data_buf.data(), nb); - index.final_graph.save("hnsw_index_glass"); - - glass::Searcher> searcher(index.final_graph); - searcher.SetData(data_buf.data(), nb, d); - searcher.SetEf(500); - searcher.Optimize(96); - - auto start = std::chrono::high_resolution_clock::now(); -#pragma omp parallel for schedule(dynamic) - for(int i = 0; i < nq; i++) { - searcher.Search(queries[i].data(), k, output[i].data()); - } - auto end = std::chrono::high_resolution_clock::now(); - cout << "search time: " << std::chrono::duration(end - start).count() << " s" << std::endl; - - std::atomic total_coselection{0}; - std::atomic total_num{0}; -#pragma omp parallel for - for (int i = 0; i < nq; i++) { - int cur_coselection = 0; - std::set gt(GT[i].begin(), GT[i].end()); - std::set res(output[i].begin(), output[i].end()); - for (auto item: res) { - if (gt.find(static_cast(item)) != gt.end()) { - cur_coselection++; - } - } - total_num += 1; - total_coselection += cur_coselection; - } - - std::cout << "recall = " << (double) total_coselection * 100 / (total_num * 100) << " %" << std::endl; -} \ No newline at end of file From 04e37363a72326b8e325cb524d5199a0e69ede7a Mon Sep 17 00:00:00 2001 From: weijian Date: Tue, 14 May 2024 02:23:03 +0000 Subject: [PATCH 6/6] fix --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 068de5c..9785597 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ build .cache -cmake-build-release