From 21207c8cd01e5c2dd5f454fcdaa18535e0e9186a Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Fri, 22 Nov 2024 14:30:26 +0000 Subject: [PATCH] cleanup: GC old ANN experiments --- eval/CMakeLists.txt | 1 - eval/src/tests/ann/.gitignore | 2 - eval/src/tests/ann/CMakeLists.txt | 31 -- eval/src/tests/ann/bruteforce-nns.h | 74 --- eval/src/tests/ann/doc_vector_access.h | 13 - eval/src/tests/ann/extended-hnsw.cpp | 636 ------------------------- eval/src/tests/ann/find-with-nns.h | 12 - eval/src/tests/ann/for-sift-hit.h | 10 - eval/src/tests/ann/for-sift-top-k.h | 26 - eval/src/tests/ann/gist_benchmark.cpp | 143 ------ eval/src/tests/ann/hnsw-like.h | 203 -------- eval/src/tests/ann/nns-l2.h | 68 --- eval/src/tests/ann/nns.h | 99 ---- eval/src/tests/ann/point-vector.h | 30 -- eval/src/tests/ann/quality-nns.h | 42 -- eval/src/tests/ann/read-vecs.h | 45 -- eval/src/tests/ann/remove-bm.cpp | 248 ---------- eval/src/tests/ann/sift_benchmark.cpp | 256 ---------- eval/src/tests/ann/std-random.h | 21 - eval/src/tests/ann/time-util.h | 9 - eval/src/tests/ann/verify-top-k.h | 27 -- eval/src/tests/ann/xp-annoy-nns.cpp | 480 ------------------- eval/src/tests/ann/xp-hnsw-wrap.cpp | 84 ---- eval/src/tests/ann/xp-hnswlike-nns.cpp | 544 --------------------- eval/src/tests/ann/xp-lsh-nns.cpp | 269 ----------- 25 files changed, 3373 deletions(-) delete mode 100644 eval/src/tests/ann/.gitignore delete mode 100644 eval/src/tests/ann/CMakeLists.txt delete mode 100644 eval/src/tests/ann/bruteforce-nns.h delete mode 100644 eval/src/tests/ann/doc_vector_access.h delete mode 100644 eval/src/tests/ann/extended-hnsw.cpp delete mode 100644 eval/src/tests/ann/find-with-nns.h delete mode 100644 eval/src/tests/ann/for-sift-hit.h delete mode 100644 eval/src/tests/ann/for-sift-top-k.h delete mode 100644 eval/src/tests/ann/gist_benchmark.cpp delete mode 100644 eval/src/tests/ann/hnsw-like.h delete mode 100644 eval/src/tests/ann/nns-l2.h delete mode 100644 eval/src/tests/ann/nns.h delete mode 100644 eval/src/tests/ann/point-vector.h delete mode 100644 eval/src/tests/ann/quality-nns.h delete mode 100644 eval/src/tests/ann/read-vecs.h delete mode 100644 eval/src/tests/ann/remove-bm.cpp delete mode 100644 eval/src/tests/ann/sift_benchmark.cpp delete mode 100644 eval/src/tests/ann/std-random.h delete mode 100644 eval/src/tests/ann/time-util.h delete mode 100644 eval/src/tests/ann/verify-top-k.h delete mode 100644 eval/src/tests/ann/xp-annoy-nns.cpp delete mode 100644 eval/src/tests/ann/xp-hnsw-wrap.cpp delete mode 100644 eval/src/tests/ann/xp-hnswlike-nns.cpp delete mode 100644 eval/src/tests/ann/xp-lsh-nns.cpp diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index 911af81e754d..12e35e6fb3c8 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -10,7 +10,6 @@ vespa_define_module( src/apps/tensor_conformance TESTS - src/tests/ann src/tests/apps/analyze_onnx_model src/tests/apps/eval_expr src/tests/eval/addr_to_symbol diff --git a/eval/src/tests/ann/.gitignore b/eval/src/tests/ann/.gitignore deleted file mode 100644 index 9249517f4b38..000000000000 --- a/eval/src/tests/ann/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*_app -log* diff --git a/eval/src/tests/ann/CMakeLists.txt b/eval/src/tests/ann/CMakeLists.txt deleted file mode 100644 index 71a4f6a3310a..000000000000 --- a/eval/src/tests/ann/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -vespa_add_executable(eval_sift_benchmark_app - SOURCES - sift_benchmark.cpp - xp-annoy-nns.cpp - xp-hnswlike-nns.cpp - xp-lsh-nns.cpp - DEPENDS - vespaeval -) - -vespa_add_executable(eval_gist_benchmark_app - SOURCES - gist_benchmark.cpp - xp-annoy-nns.cpp - extended-hnsw.cpp - xp-lsh-nns.cpp - DEPENDS - vespaeval -) - -vespa_add_executable(eval_remove_bm_app - SOURCES - remove-bm.cpp - xp-annoy-nns.cpp - xp-hnswlike-nns.cpp - xp-lsh-nns.cpp - DEPENDS - vespaeval -) diff --git a/eval/src/tests/ann/bruteforce-nns.h b/eval/src/tests/ann/bruteforce-nns.h deleted file mode 100644 index e3cbeecb4877..000000000000 --- a/eval/src/tests/ann/bruteforce-nns.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -std::vector bruteforceResults; - -double computeDistance(const PointVector &query, uint32_t docid) { - const PointVector &docvector = generatedDocs[docid]; - return l2distCalc.l2sq_dist(query, docvector); -} - -struct BfHitComparator { - bool operator() (const Hit &lhs, const Hit& rhs) const { - if (lhs.distance < rhs.distance) return false; - if (lhs.distance > rhs.distance) return true; - return (lhs.docid > rhs.docid); - } -}; - -class BfHitHeap { -private: - size_t _size; - vespalib::PriorityQueue _priQ; -public: - explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() { - _priQ.reserve(maxSize); - } - ~BfHitHeap() {} - void maybe_use(const Hit &hit) { - if (_priQ.size() < _size) { - _priQ.push(hit); - } else if (hit.distance < _priQ.front().distance) { - _priQ.front() = hit; - _priQ.adjust(); - } - } - std::vector bestHits() { - std::vector result; - size_t i = _priQ.size(); - result.resize(i); - while (i-- > 0) { - result[i] = _priQ.front(); - _priQ.pop_front(); - } - return result; - } -}; - -TopK bruteforce_nns(const PointVector &query) { - TopK result; - BfHitHeap heap(result.K); - for (uint32_t docid = 0; docid < EFFECTIVE_DOCS; ++docid) { - const PointVector &docvector = generatedDocs[docid]; - double d = l2distCalc.l2sq_dist(query, docvector); - Hit h(docid, d); - heap.maybe_use(h); - } - std::vector best = heap.bestHits(); - for (size_t i = 0; i < result.K; ++i) { - result.hits[i] = best[i]; - } - return result; -} - -void verifyBF(uint32_t qid) { - const PointVector &query = generatedQueries[qid]; - TopK &result = bruteforceResults[qid]; - double min_distance = result.hits[0].distance; - for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { - double dist = computeDistance(query, i); - if (dist < min_distance) { - fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance); - } - EXPECT_FALSE(dist+0.000001 < min_distance); - } -} diff --git a/eval/src/tests/ann/doc_vector_access.h b/eval/src/tests/ann/doc_vector_access.h deleted file mode 100644 index 0a23951ca274..000000000000 --- a/eval/src/tests/ann/doc_vector_access.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include -#include - -template -struct DocVectorAccess -{ - virtual std::span get(uint32_t docid) const = 0; - virtual ~DocVectorAccess() = default; -}; diff --git a/eval/src/tests/ann/extended-hnsw.cpp b/eval/src/tests/ann/extended-hnsw.cpp deleted file mode 100644 index 2245c959dbbb..000000000000 --- a/eval/src/tests/ann/extended-hnsw.cpp +++ /dev/null @@ -1,636 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "hnsw-like.h" - -static size_t distcalls_simple; -static size_t distcalls_search_layer; -static size_t distcalls_other; -static size_t distcalls_heuristic; -static size_t distcalls_shrink; -static size_t distcalls_refill; -static size_t refill_needed_calls; -static size_t shrink_needed_calls; -static size_t disconnected_weak_links; -static size_t disconnected_for_symmetry; -static size_t select_n_full; -static size_t select_n_partial; - - -HnswLikeNns::HnswLikeNns(uint32_t numDims, const DocVectorAccess &dva) - : NNS(numDims, dva), - _nodes(), - _entryId(0), - _entryLevel(-1), - _M(16), - _efConstruction(200), - _levelMultiplier(1.0 / log(1.0 * _M)), - _rndGen(), - _ops_counter(0) -{ -} - -// simple greedy search -HnswHit -HnswLikeNns::search_layer_simple(Vector vector, HnswHit curPoint, uint32_t searchLevel) { - bool keepGoing = true; - while (keepGoing) { - keepGoing = false; - const LinkList& neighbors = getLinkList(curPoint.docid, searchLevel); - for (uint32_t n_id : neighbors) { - double dist = distance(vector, n_id); - ++distcalls_simple; - if (dist < curPoint.dist) { - curPoint = HnswHit(n_id, SqDist(dist)); - keepGoing = true; - } - } - } - return curPoint; -} - - -bool -HnswLikeNns::haveCloserDistance(HnswHit e, const LinkList &r) const { - for (uint32_t prevId : r) { - double dist = distance(e.docid, prevId); - ++distcalls_heuristic; - if (dist < e.dist) return true; - } - return false; -} - -void -HnswLikeNns::addDoc(uint32_t docid) { - Vector vector = _dva.get(docid); - for (uint32_t id = _nodes.size(); id <= docid; ++id) { - _nodes.emplace_back(id, 0, _M); - } - int level = randomLevel(); - assert(_nodes[docid]._links.size() == 0); - _nodes[docid] = Node(docid, level+1, _M); - if (_entryLevel < 0) { - _entryId = docid; - _entryLevel = level; - track_ops(); - return; - } - int searchLevel = _entryLevel; - VisitedSet &visited = _visitedSetPool.get(_nodes.size()); - double entryDist = distance(vector, _entryId); - ++distcalls_other; - HnswHit entryPoint(_entryId, SqDist(entryDist)); -#undef MULTI_ENTRY_I -#ifdef MULTI_ENTRY_I - FurthestPriQ w; - w.push(entryPoint); - while (searchLevel > level) { - search_layer(vector, w, visited, 5 * _M, searchLevel); - --searchLevel; - } -#else - while (searchLevel > level) { - entryPoint = search_layer_simple(vector, entryPoint, searchLevel); - --searchLevel; - } - FurthestPriQ w; - w.push(entryPoint); -#endif - searchLevel = std::min(level, _entryLevel); - while (searchLevel >= 0) { - search_layer(vector, w, visited, _efConstruction, searchLevel); - LinkList neighbors = select_neighbors(w.peek(), _M); - connect_new_node(docid, neighbors, searchLevel); - each_shrink_ifneeded(neighbors, searchLevel); - --searchLevel; - } - if (level > _entryLevel) { - _entryLevel = level; - _entryId = docid; - } - track_ops(); -} - -void -HnswLikeNns::track_ops() { - _ops_counter++; - if ((_ops_counter % 10000) == 0) { - double div = _ops_counter; - fprintf(stderr, "add / remove ops: %zu\n", _ops_counter); - fprintf(stderr, "distance calls for layer: %zu is %.3f per op\n", distcalls_search_layer, distcalls_search_layer/ div); - fprintf(stderr, "distance calls for heuristic: %zu is %.3f per op\n", distcalls_heuristic, distcalls_heuristic / div); - fprintf(stderr, "distance calls for simple: %zu is %.3f per op\n", distcalls_simple, distcalls_simple / div); - fprintf(stderr, "distance calls for shrink: %zu is %.3f per op\n", distcalls_shrink, distcalls_shrink / div); - fprintf(stderr, "distance calls for refill: %zu is %.3f per op\n", distcalls_refill, distcalls_refill / div); - fprintf(stderr, "distance calls for other: %zu is %.3f per op\n", distcalls_other, distcalls_other / div); - fprintf(stderr, "refill needed calls: %zu is %.3f per op\n", refill_needed_calls, refill_needed_calls / div); - fprintf(stderr, "shrink needed calls: %zu is %.3f per op\n", shrink_needed_calls, shrink_needed_calls / div); - fprintf(stderr, "disconnected weak links: %zu is %.3f per op\n", disconnected_weak_links, disconnected_weak_links / div); - fprintf(stderr, "disconnected for symmetry: %zu is %.3f per op\n", disconnected_for_symmetry, disconnected_for_symmetry / div); - fprintf(stderr, "select neighbors: partial %zu vs full %zu\n", select_n_partial, select_n_full); - } -} - -#ifdef SIMPLE_REFILL -void -HnswLikeNns::refill_ifneeded(uint32_t my_id, const LinkList &replacements, uint32_t level) { - LinkList &my_links = getLinkList(my_id, level); - if (my_links.size() * 2 < _M) { - const uint32_t maxLinks = (level > 0) ? _M : (2 * _M); - ++refill_needed_calls; - for (uint32_t repl_id : replacements) { - if (repl_id == my_id) continue; - if (my_links.has_link_to(repl_id)) continue; - LinkList &other_links = getLinkList(repl_id, level); - if (other_links.size() >= maxLinks) continue; - other_links.push_back(my_id); - my_links.push_back(repl_id); - if (my_links.size() >= _M) return; - } - } -} -#endif - -#define REFILL_ALL -#ifdef REFILL_ALL -void -HnswLikeNns::refill_ifneeded(uint32_t my_id, const LinkList &replacements, uint32_t level) { - LinkList &my_links = getLinkList(my_id, level); - if (my_links.size() >= _M) return; - ++refill_needed_calls; - const uint32_t maxLinks = (level > 0) ? _M : (2 * _M); - NearestPriQ w; - for (uint32_t repl_id : replacements) { - if (repl_id == my_id) continue; - if (my_links.has_link_to(repl_id)) continue; - const LinkList &other_links = getLinkList(repl_id, level); - if (other_links.size() >= maxLinks) continue; - double dist = distance(my_id, repl_id); - ++distcalls_refill; - w.emplace(repl_id, SqDist(dist)); - } - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (haveCloserDistance(e, my_links)) continue; - LinkList &other_links = getLinkList(e.docid, level); - my_links.push_back(e.docid); - other_links.push_back(my_id); - if (my_links.size() == _M) break; - } -} -#endif - -#ifdef REFILL_ONE -void -HnswLikeNns::refill_ifneeded(uint32_t my_id, const LinkList &replacements, uint32_t level) { - LinkList &my_links = getLinkList(my_id, level); - if (my_links.size() >= _M) return; - ++refill_needed_calls; - NearestPriQ w; - for (uint32_t repl_id : replacements) { - if (repl_id == my_id) continue; - if (my_links.has_link_to(repl_id)) continue; - LinkList &other_links = getLinkList(repl_id, level); - if (other_links.size() >= _M) continue; - double dist = distance(my_id, repl_id); - ++distcalls_refill; - w.emplace(repl_id, SqDist(dist)); - } - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (haveCloserDistance(e, my_links)) continue; - LinkList &other_links = getLinkList(e.docid, level); - my_links.push_back(e.docid); - other_links.push_back(my_id); - return; - } -} -#endif - -void -HnswLikeNns::shrink_links(uint32_t shrink_id, uint32_t maxLinks, uint32_t level) { - LinkList &links = getLinkList(shrink_id, level); - NearestList distances; - for (uint32_t n_id : links) { - double n_dist = distance(shrink_id, n_id); - ++distcalls_shrink; - distances.emplace_back(n_id, SqDist(n_dist)); - } - LinkList lostLinks; - LinkList oldLinks = links; - links = remove_weakest(distances, maxLinks, lostLinks); -#define KEEP_SYM -#ifdef KEEP_SYM - for (uint32_t lost_id : lostLinks) { - ++disconnected_for_symmetry; - remove_link_from(lost_id, shrink_id, level); - } -#define DO_REFILL_AFTER_KEEP_SYM -#ifdef DO_REFILL_AFTER_KEEP_SYM - for (uint32_t lost_id : lostLinks) { - refill_ifneeded(lost_id, oldLinks, level); - } -#endif -#endif -} - - -void -HnswLikeNns::removeDoc(uint32_t docid) { - Node &node = _nodes[docid]; - bool need_new_entrypoint = (docid == _entryId); - for (int level = node._links.size(); level-- > 0; ) { - LinkList my_links; - my_links.swap(node._links[level]); - for (uint32_t n_id : my_links) { - if (need_new_entrypoint) { - _entryId = n_id; - _entryLevel = level; - need_new_entrypoint = false; - } - remove_link_from(n_id, docid, level); - } - while (! my_links.empty()) { - uint32_t n_id = my_links.back(); - my_links.pop_back(); - refill_ifneeded(n_id, my_links, level); - } - } - node = Node(docid, 0, _M); - if (need_new_entrypoint) { - _entryLevel = -1; - _entryId = 0; - for (uint32_t i = 0; i < _nodes.size(); ++i) { - if (_nodes[i]._links.size() > 0) { - _entryId = i; - _entryLevel = _nodes[i]._links.size() - 1; - break; - } - } - } - track_ops(); -} - -std::vector -HnswLikeNns::topK(uint32_t k, Vector vector, uint32_t search_k) { - std::vector result; - if (_entryLevel < 0) return result; - double entryDist = distance(vector, _entryId); - ++distcalls_other; - HnswHit entryPoint(_entryId, SqDist(entryDist)); - int searchLevel = _entryLevel; - VisitedSet &visited = _visitedSetPool.get(_nodes.size()); -#undef MULTI_ENTRY_S -#ifdef MULTI_ENTRY_S - FurthestPriQ w; - w.push(entryPoint); - while (searchLevel > 0) { - search_layer(vector, w, visited, std::min(k, search_k), searchLevel); - --searchLevel; - } -#else - while (searchLevel > 0) { - entryPoint = search_layer_simple(vector, entryPoint, searchLevel); - --searchLevel; - } - FurthestPriQ w; - w.push(entryPoint); -#endif - search_layer(vector, w, visited, std::max(k, search_k), 0); - while (w.size() > k) { - w.pop(); - } - NearestList tmp = w.steal(); - std::sort(tmp.begin(), tmp.end(), LesserDist()); - result.reserve(tmp.size()); - for (const auto & hit : tmp) { - result.emplace_back(hit.docid, SqDist(hit.dist)); - } - return result; -} - - -double -HnswLikeNns::distance(Vector v, uint32_t b) const -{ - Vector w = _dva.get(b); - return l2distCalc.l2sq_dist(v, w); -} - -std::vector -HnswLikeNns::topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) -{ - std::vector result; - if (_entryLevel < 0) return result; - double entryDist = distance(vector, _entryId); - ++distcalls_other; - HnswHit entryPoint(_entryId, SqDist(entryDist)); - int searchLevel = _entryLevel; - VisitedSet &visited = _visitedSetPool.get(_nodes.size()); -#ifdef MULTI_ENTRY_S - FurthestPriQ w; - w.push(entryPoint); - while (searchLevel > 0) { - search_layer(vector, w, visited, std::min(k, search_k), searchLevel); - --searchLevel; - } -#else - while (searchLevel > 0) { - entryPoint = search_layer_simple(vector, entryPoint, searchLevel); - --searchLevel; - } - FurthestPriQ w; - w.push(entryPoint); -#endif - search_layer_with_filter(vector, w, visited, std::max(k, search_k), 0, skipDocIds); - NearestList tmp = w.steal(); - std::sort(tmp.begin(), tmp.end(), LesserDist()); - result.reserve(std::min((size_t)k, tmp.size())); - for (const auto & hit : tmp) { - if (skipDocIds.isSet(hit.docid)) continue; - result.emplace_back(hit.docid, SqDist(hit.dist)); - if (result.size() == k) break; - } - return result; -} - -void -HnswLikeNns::each_shrink_ifneeded(const LinkList &neighbors, uint32_t level) { - uint32_t maxLinks = (level > 0) ? _M : (2 * _M); - for (uint32_t old_id : neighbors) { - LinkList &oldLinks = getLinkList(old_id, level); - if (oldLinks.size() > maxLinks) { - ++shrink_needed_calls; - shrink_links(old_id, maxLinks, level); - } - } -} - -void -HnswLikeNns::search_layer(Vector vector, FurthestPriQ &w, - VisitedSet &visited, - uint32_t ef, uint32_t searchLevel) -{ - NearestPriQ candidates; - - for (const HnswHit & entry : w.peek()) { - candidates.push(entry); - visited.mark(entry.docid); - } - double limd = std::numeric_limits::max(); - while (! candidates.empty()) { - HnswHit cand = candidates.top(); - if (cand.dist > limd) { - break; - } - candidates.pop(); - for (uint32_t e_id : getLinkList(cand.docid, searchLevel)) { - if (visited.isMarked(e_id)) continue; - visited.mark(e_id); - double e_dist = distance(vector, e_id); - ++distcalls_search_layer; - if (e_dist < limd) { - candidates.emplace(e_id, SqDist(e_dist)); - w.emplace(e_id, SqDist(e_dist)); - if (w.size() > ef) { - w.pop(); - limd = w.top().dist; - } - } - } - } - return; -} - -void -HnswLikeNns::search_layer_with_filter(Vector vector, FurthestPriQ &w, - VisitedSet &visited, - uint32_t ef, uint32_t searchLevel, - const BitVector &skipDocIds) -{ - NearestPriQ candidates; - - for (const HnswHit & entry : w.peek()) { - candidates.push(entry); - visited.mark(entry.docid); - if (skipDocIds.isSet(entry.docid)) ++ef; - } - double limd = std::numeric_limits::max(); - while (! candidates.empty()) { - HnswHit cand = candidates.top(); - if (cand.dist > limd) { - break; - } - candidates.pop(); - for (uint32_t e_id : getLinkList(cand.docid, searchLevel)) { - if (visited.isMarked(e_id)) continue; - visited.mark(e_id); - double e_dist = distance(vector, e_id); - ++distcalls_search_layer; - if (e_dist < limd) { - candidates.emplace(e_id, SqDist(e_dist)); - if (skipDocIds.isSet(e_id)) continue; - w.emplace(e_id, SqDist(e_dist)); - if (w.size() > ef) { - w.pop(); - limd = w.top().dist; - } - } - } - } -} - -LinkList -HnswLikeNns::remove_weakest(const NearestList &neighbors, uint32_t curMax, LinkList &lost) const -{ - LinkList result; - result.reserve(curMax+1); - NearestPriQ w; - for (const auto & entry : neighbors) { - w.push(entry); - } - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (result.size() == curMax || haveCloserDistance(e, result)) { - lost.push_back(e.docid); - } else { - result.push_back(e.docid); - } - } - return result; -} - -#define NO_BACKFILL -#ifdef NO_BACKFILL -LinkList -HnswLikeNns::select_neighbors(const NearestList &neighbors, uint32_t curMax) const -{ - LinkList result; - result.reserve(curMax+1); - NearestPriQ w; - for (const auto & entry : neighbors) { - w.push(entry); - } - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (haveCloserDistance(e, result)) { - continue; - } - result.push_back(e.docid); - if (result.size() == curMax) { - ++select_n_full; - return result; - } - } - ++select_n_partial; - return result; -} -#else -LinkList -HnswLikeNns::select_neighbors(const NearestList &neighbors, uint32_t curMax) const -{ - LinkList result; - result.reserve(curMax+1); - bool needFiltering = (neighbors.size() > curMax); - NearestPriQ w; - for (const auto & entry : neighbors) { - w.push(entry); - } - LinkList backfill; - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (needFiltering && haveCloserDistance(e, result)) { - backfill.push_back(e.docid); - continue; - } - result.push_back(e.docid); - if (result.size() == curMax) return result; - } - if (result.size() * 4 < _M) { - for (uint32_t fill_id : backfill) { - result.push_back(fill_id); - if (result.size() * 2 >= _M) break; - } - } - return result; -} -#endif - -void -HnswLikeNns::connect_new_node(uint32_t id, const LinkList &neighbors, uint32_t level) { - LinkList &newLinks = getLinkList(id, level); - for (uint32_t neigh_id : neighbors) { - LinkList &oldLinks = getLinkList(neigh_id, level); - newLinks.push_back(neigh_id); - oldLinks.push_back(id); - } -#define DISCONNECT_OLD_WEAK_LINKS -#ifdef DISCONNECT_OLD_WEAK_LINKS - for (uint32_t i = 1; i < neighbors.size(); ++i) { - uint32_t n_1 = neighbors[i]; - LinkList &links_1 = getLinkList(n_1, level); - for (uint32_t j = 0; j < i; ++j) { - uint32_t n_2 = neighbors[j]; - if (links_1.has_link_to(n_2)) { - ++disconnected_weak_links; - LinkList &links_2 = getLinkList(n_2, level); - links_1.remove_link(n_2); - links_2.remove_link(n_1); - } - } - } -#endif -} - -uint32_t -HnswLikeNns::count_reachable() const { - VisitedSet visited(_nodes.size()); - int level = _entryLevel; - LinkList curList; - curList.push_back(_entryId); - visited.mark(_entryId); - uint32_t idx = 0; - while (level >= 0) { - while (idx < curList.size()) { - uint32_t id = curList[idx++]; - const LinkList &links = getLinkList(id, level); - for (uint32_t n_id : links) { - if (visited.isMarked(n_id)) continue; - visited.mark(n_id); - curList.push_back(n_id); - } - } - --level; - idx = 0; - } - return curList.size(); -} - -void -HnswLikeNns::dumpStats() const { - std::vector levelCounts; - levelCounts.resize(_entryLevel + 2); - std::vector outLinkHist; - outLinkHist.resize(2 * _M + 2); - uint32_t symmetrics = 0; - uint32_t level1links = 0; - uint32_t both_l_links = 0; - fprintf(stderr, "stats for HnswLikeNns with %zu nodes, entry level = %d, entry id = %u\n", - _nodes.size(), _entryLevel, _entryId); - - for (uint32_t id = 0; id < _nodes.size(); ++id) { - const auto &node = _nodes[id]; - uint32_t levels = node._links.size(); - levelCounts[levels]++; - if (levels < 1) { - outLinkHist[0]++; - continue; - } - const LinkList &link_list = getLinkList(id, 0); - uint32_t numlinks = link_list.size(); - outLinkHist[numlinks]++; - if (numlinks < 1) { - fprintf(stderr, "node with %u links: id %u\n", numlinks, id); - } - bool all_sym = true; - for (uint32_t n_id : link_list) { - const LinkList &neigh_list = getLinkList(n_id, 0); - if (! neigh_list.has_link_to(id)) { -#ifdef KEEP_SYM - fprintf(stderr, "BAD: %u has link to neighbor %u, but backlink is missing\n", id, n_id); -#endif - all_sym = false; - } - } - if (all_sym) ++symmetrics; - if (levels < 2) continue; - const LinkList &link_list_1 = getLinkList(id, 1); - for (uint32_t n_id : link_list_1) { - ++level1links; - if (link_list.has_link_to(n_id)) ++both_l_links; - } - } - for (uint32_t l = 0; l < levelCounts.size(); ++l) { - fprintf(stderr, "Nodes on %u levels: %u\n", l, levelCounts[l]); - } - fprintf(stderr, "reachable nodes %u / %zu\n", - count_reachable(), _nodes.size() - levelCounts[0]); - fprintf(stderr, "level 1 links overlapping on l0: %u / total: %u\n", - both_l_links, level1links); - for (uint32_t l = 0; l < outLinkHist.size(); ++l) { - if (outLinkHist[l] != 0) { - fprintf(stderr, "Nodes with %u outward links on L0: %u\n", l, outLinkHist[l]); - } - } - fprintf(stderr, "Symmetric in-out nodes: %u\n", symmetrics); -} - -std::unique_ptr> -make_hnsw_nns(uint32_t numDims, const DocVectorAccess &dva) -{ - return std::make_unique(numDims, dva); -} diff --git a/eval/src/tests/ann/find-with-nns.h b/eval/src/tests/ann/find-with-nns.h deleted file mode 100644 index e13080b2eeff..000000000000 --- a/eval/src/tests/ann/find-with-nns.h +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -TopK find_with_nns(uint32_t sk, NNS_API &nns, uint32_t qid) { - TopK result; - const PointVector &qv = generatedQueries[qid]; - std::span query(qv.v, NUM_DIMS); - auto rv = nns.topK(result.K, query, sk); - for (size_t i = 0; i < result.K; ++i) { - result.hits[i] = Hit(rv[i].docid, rv[i].sq.distance); - } - return result; -} diff --git a/eval/src/tests/ann/for-sift-hit.h b/eval/src/tests/ann/for-sift-hit.h deleted file mode 100644 index bb0e6f7e6a15..000000000000 --- a/eval/src/tests/ann/for-sift-hit.h +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -struct Hit { - uint32_t docid; - double distance; - Hit() noexcept : docid(0u), distance(0.0) {} - Hit(int id, double dist) : docid(id), distance(dist) {} -}; diff --git a/eval/src/tests/ann/for-sift-top-k.h b/eval/src/tests/ann/for-sift-top-k.h deleted file mode 100644 index 810b436ba06b..000000000000 --- a/eval/src/tests/ann/for-sift-top-k.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -struct TopK { - static constexpr size_t K = 100; - Hit hits[K]; - - size_t recall(const TopK &other) const { - size_t overlap = 0; - size_t i = 0; - size_t j = 0; - while (i < K && j < K) { - if (hits[i].docid == other.hits[j].docid) { - ++overlap; - ++i; - ++j; - } else if (hits[i].distance < other.hits[j].distance) { - ++i; - } else { - ++j; - } - } - return overlap; - } -}; diff --git a/eval/src/tests/ann/gist_benchmark.cpp b/eval/src/tests/ann/gist_benchmark.cpp deleted file mode 100644 index 207f52e80744..000000000000 --- a/eval/src/tests/ann/gist_benchmark.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_DIMS 960 -#define NUM_DOCS 200000 -#define EFFECTIVE_DOCS NUM_DOCS -#define NUM_REACH 10000 -#define NUM_Q 1000 - -#include "doc_vector_access.h" -#include "nns.h" -#include "for-sift-hit.h" -#include "for-sift-top-k.h" -#include "time-util.h" -#include "point-vector.h" -#include "read-vecs.h" -#include "bruteforce-nns.h" - -using NNS_API = NNS; - -TEST("require that brute force works") { - TimePoint bef = std::chrono::steady_clock::now(); - fprintf(stderr, "generating %u brute force results\n", NUM_Q); - bruteforceResults.reserve(NUM_Q); - for (uint32_t cnt = 0; cnt < NUM_Q; ++cnt) { - const PointVector &query = generatedQueries[cnt]; - bruteforceResults.emplace_back(bruteforce_nns(query)); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for brute force: %.3f ms = %.3f ms per query\n", - to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); - for (int cnt = 0; cnt < NUM_Q; cnt = (cnt+1)*2) { - verifyBF(cnt); - } -} - -#include "find-with-nns.h" -#include "verify-top-k.h" - -void timing_nns(const char *name, NNS_API &nns, std::vector sk_list) { - for (uint32_t search_k : sk_list) { - TimePoint bef = std::chrono::steady_clock::now(); - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - find_with_nns(search_k, nns, cnt); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for %s search_k=%u: %.3f ms = %.3f ms/q\n", - name, search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); - } -} - -#include "quality-nns.h" - -template -void bm_nns_simple(const char *name, FUNC creator, std::vector sk_list) { - std::unique_ptr nnsp = creator(); - NNS_API &nns = *nnsp; - fprintf(stderr, "trying %s indexing...\n", name); - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS; ++i) { - nns.addDoc(i); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, NUM_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s [A] clean build with %u documents:\n", name, NUM_DOCS); - quality_nns(nns, sk_list); -} - -template -void benchmark_nns(const char *name, FUNC creator, std::vector sk_list) { - bm_nns_simple(name, creator, sk_list); -} - -#if 0 -TEST("require that Locality Sensitive Hashing mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_rplsh_nns(NUM_DIMS, adapter); }; - benchmark_nns("RPLSH", creator, { 200, 1000 }); -} -#endif - -#if 0 -TEST("require that Annoy via NNS api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_annoy_nns(NUM_DIMS, adapter); }; - benchmark_nns("Annoy", creator, { 8000, 10000 }); -} -#endif - -#if 1 -TEST("require that HNSW via NNS api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_hnsw_nns(NUM_DIMS, adapter); }; - benchmark_nns("HNSW-like", creator, { 100, 150, 200 }); -} -#endif - -#if 0 -TEST("require that HNSW wrapped api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_hnsw_wrap(NUM_DIMS, adapter); }; - benchmark_nns("HNSW-wrap", creator, { 100, 150, 200 }); -} -#endif - -/** - * Before running the benchmark the ANN_GIST1M data set must be downloaded and extracted: - * wget ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz - * tar -xf gist.tar.gz - * - * The benchmark program will load the data set from $HOME/gist if no directory is specified. - * - * More information about the dataset is found here: http://corpus-texmex.irisa.fr/. - */ -int main(int argc, char **argv) { - TEST_MASTER.init(__FILE__); - std::string data_set = "gist"; - std::string data_dir = "."; - if (argc > 2) { - data_set = argv[1]; - data_dir = argv[2]; - } else if (argc > 1) { - data_dir = argv[1]; - } else { - char *home = getenv("HOME"); - if (home) { - data_dir = home; - data_dir += "/" + data_set; - } - } - read_data(data_dir, data_set); - TEST_RUN_ALL(); - return (TEST_MASTER.fini() ? 0 : 1); -} diff --git a/eval/src/tests/ann/hnsw-like.h b/eval/src/tests/ann/hnsw-like.h deleted file mode 100644 index 9ad8fcb51aff..000000000000 --- a/eval/src/tests/ann/hnsw-like.h +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include -#include -#include -#include -#include "std-random.h" -#include "nns.h" - -struct LinkList : std::vector -{ - bool has_link_to(uint32_t id) const { - auto iter = std::find(begin(), end(), id); - return (iter != end()); - } - void remove_link(uint32_t id) { - uint32_t last = back(); - for (iterator iter = begin(); iter != end(); ++iter) { - if (*iter == id) { - *iter = last; - pop_back(); - return; - } - } - fprintf(stderr, "BAD missing link to remove: %u\n", id); - abort(); - } -}; - -struct Node { - std::vector _links; - Node(uint32_t , uint32_t numLevels, uint32_t M) - : _links(numLevels) - { - for (uint32_t i = 0; i < _links.size(); ++i) { - _links[i].reserve((i == 0) ? (2 * M + 1) : (M+1)); - } - } -}; - -struct VisitedSet -{ - using Mark = unsigned short; - Mark *ptr; - Mark curval; - size_t sz; - VisitedSet(const VisitedSet &) = delete; - VisitedSet& operator=(const VisitedSet &) = delete; - explicit VisitedSet(size_t size) { - ptr = (Mark *)malloc(size * sizeof(Mark)); - curval = -1; - sz = size; - clear(); - } - void clear() { - ++curval; - if (curval == 0) { - memset(ptr, 0, sz * sizeof(Mark)); - ++curval; - } - } - ~VisitedSet() { free(ptr); } - void mark(size_t id) { ptr[id] = curval; } - bool isMarked(size_t id) const { return ptr[id] == curval; } -}; - -struct VisitedSetPool -{ - std::unique_ptr lastUsed; - VisitedSetPool() { - lastUsed = std::make_unique(250); - } - ~VisitedSetPool() {} - VisitedSet &get(size_t size) { - if (size > lastUsed->sz) { - lastUsed = std::make_unique(size*2); - } else { - lastUsed->clear(); - } - return *lastUsed; - } -}; - -struct HnswHit { - double dist; - uint32_t docid; - HnswHit(uint32_t di, SqDist sq) noexcept : dist(sq.distance), docid(di) {} -}; - -struct GreaterDist { - bool operator() (const HnswHit &lhs, const HnswHit& rhs) const { - return (rhs.dist < lhs.dist); - } -}; -struct LesserDist { - bool operator() (const HnswHit &lhs, const HnswHit& rhs) const { - return (lhs.dist < rhs.dist); - } -}; - -using NearestList = std::vector; - -struct NearestPriQ : std::priority_queue -{ -}; - -struct FurthestPriQ : std::priority_queue -{ - NearestList steal() { - NearestList result; - c.swap(result); - return result; - } - const NearestList& peek() const { return c; } -}; - -class HnswLikeNns : public NNS -{ -private: - std::vector _nodes; - uint32_t _entryId; - int _entryLevel; - uint32_t _M; - uint32_t _efConstruction; - double _levelMultiplier; - RndGen _rndGen; - VisitedSetPool _visitedSetPool; - size_t _ops_counter; - - double distance(Vector v, uint32_t id) const; - - double distance(uint32_t a, uint32_t b) const { - Vector v = _dva.get(a); - return distance(v, b); - } - - int randomLevel() { - double unif = _rndGen.nextUniform(); - double r = -log(1.0-unif) * _levelMultiplier; - return (int) r; - } - - uint32_t count_reachable() const; - void dumpStats() const; - -public: - HnswLikeNns(uint32_t numDims, const DocVectorAccess &dva); - ~HnswLikeNns() { dumpStats(); } - - LinkList& getLinkList(uint32_t docid, uint32_t level) { - return _nodes[docid]._links[level]; - } - - const LinkList& getLinkList(uint32_t docid, uint32_t level) const { - return _nodes[docid]._links[level]; - } - - HnswHit search_layer_simple(Vector vector, HnswHit curPoint, uint32_t searchLevel); - - void search_layer(Vector vector, FurthestPriQ &w, - uint32_t ef, uint32_t searchLevel); - void search_layer(Vector vector, FurthestPriQ &w, - VisitedSet &visited, - uint32_t ef, uint32_t searchLevel); - void search_layer_with_filter(Vector vector, FurthestPriQ &w, - uint32_t ef, uint32_t searchLevel, - const BitVector &skipDocIds); - void search_layer_with_filter(Vector vector, FurthestPriQ &w, - VisitedSet &visited, - uint32_t ef, uint32_t searchLevel, - const BitVector &skipDocIds); - - bool haveCloserDistance(HnswHit e, const LinkList &r) const; - - LinkList select_neighbors(const NearestList &neighbors, uint32_t curMax) const; - - LinkList remove_weakest(const NearestList &neighbors, uint32_t curMax, LinkList &removed) const; - - void addDoc(uint32_t docid) override; - - void track_ops(); - - void remove_link_from(uint32_t from_id, uint32_t remove_id, uint32_t level) { - LinkList &links = getLinkList(from_id, level); - links.remove_link(remove_id); - } - - void refill_ifneeded(uint32_t my_id, const LinkList &replacements, uint32_t level); - - void connect_new_node(uint32_t id, const LinkList &neighbors, uint32_t level); - - void shrink_links(uint32_t shrink_id, uint32_t maxLinks, uint32_t level); - - void each_shrink_ifneeded(const LinkList &neighbors, uint32_t level); - - void removeDoc(uint32_t docid) override; - - std::vector topK(uint32_t k, Vector vector, uint32_t search_k) override; - - std::vector topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) override; -}; diff --git a/eval/src/tests/ann/nns-l2.h b/eval/src/tests/ann/nns-l2.h deleted file mode 100644 index c7a711466fa5..000000000000 --- a/eval/src/tests/ann/nns-l2.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include -#include -#include - -template -static double hw_l2_sq_dist(const T * af, const T * bf, size_t sz) -{ - constexpr const size_t OpsPerV = VLEN/sizeof(T); - typedef T V __attribute__ ((vector_size (VLEN), aligned(VLEN))); - - const V * a = reinterpret_cast(af); - const V * b = reinterpret_cast(bf); - - V tmp_diff; - V tmp_squa; - V tmp_sum; - memset(&tmp_sum, 0, sizeof(tmp_sum)); - - const size_t numOps = sz/OpsPerV; - for (size_t i = 0; i < numOps; ++i) { - tmp_diff = a[i] - b[i]; - tmp_squa = tmp_diff * tmp_diff; - tmp_sum += tmp_squa; - } - double sum = 0; - for (size_t i = 0; i < OpsPerV; ++i) { - sum += tmp_sum[i]; - } - return sum; -} - -template -struct L2DistCalc { - const vespalib::hwaccelerated::IAccelerated & _hw; - - L2DistCalc() : _hw(vespalib::hwaccelerated::IAccelerated::getAccelerator()) {} - - using Arr = std::span; - using ConstArr = std::span; - - double product(const FltType *v1, const FltType *v2, size_t sz) { - return _hw.dotProduct(v1, v2, sz); - } - double product(ConstArr v1, ConstArr v2) { - const FltType *p1 = v1.data(); - const FltType *p2 = v2.data(); - return _hw.dotProduct(p1, p2, v1.size()); - } - double l2sq(ConstArr vector) { - const FltType *v = vector.data(); - return _hw.dotProduct(v, v, vector.size()); - } - double l2sq_dist(ConstArr v1, ConstArr v2, Arr tmp) { - for (size_t i = 0; i < v1.size(); ++i) { - tmp[i] = (v1[i] - v2[i]); - } - return l2sq(tmp); - } - double l2sq_dist(ConstArr v1, ConstArr v2) { - return hw_l2_sq_dist(v1.data(), v2.data(), v1.size()); - } -}; - -static L2DistCalc l2distCalc; diff --git a/eval/src/tests/ann/nns.h b/eval/src/tests/ann/nns.h deleted file mode 100644 index 67275c9fa509..000000000000 --- a/eval/src/tests/ann/nns.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once -#include "doc_vector_access.h" -#include "nns-l2.h" -#include -#include - -struct SqDist { - double distance; - explicit SqDist(double d) noexcept : distance(d) {} -}; - -struct NnsHit { - uint32_t docid; - SqDist sq; - NnsHit(uint32_t di, SqDist sqD) noexcept - : docid(di), sq(sqD) {} -}; -struct NnsHitComparatorLessDistance { - bool operator() (const NnsHit &lhs, const NnsHit& rhs) const { - if (lhs.sq.distance > rhs.sq.distance) return false; - if (lhs.sq.distance < rhs.sq.distance) return true; - return (lhs.docid > rhs.docid); - } -}; -struct NnsHitComparatorGreaterDistance { - bool operator() (const NnsHit &lhs, const NnsHit& rhs) const { - if (lhs.sq.distance < rhs.sq.distance) return false; - if (lhs.sq.distance > rhs.sq.distance) return true; - return (lhs.docid > rhs.docid); - } -}; -struct NnsHitComparatorLessDocid { - bool operator() (const NnsHit &lhs, const NnsHit& rhs) const { - return (lhs.docid < rhs.docid); - } -}; - -class BitVector { -private: - std::vector _bits; -public: - BitVector(size_t sz) : _bits((sz+63)/64) {} - BitVector& setBit(size_t idx) { - uint64_t mask = 1; - mask <<= (idx%64); - _bits[idx/64] |= mask; - return *this; - } - bool isSet(size_t idx) const { - uint64_t mask = 1; - mask <<= (idx%64); - uint64_t word = _bits[idx/64]; - return (word & mask) != 0; - } - BitVector& clearBit(size_t idx) { - uint64_t mask = 1; - mask <<= (idx%64); - _bits[idx/64] &= ~mask; - return *this; - } -}; - -template -class NNS -{ -public: - NNS(uint32_t numDims, const DocVectorAccess &dva) - : _numDims(numDims), _dva(dva) - {} - - virtual void addDoc(uint32_t docid) = 0; - virtual void removeDoc(uint32_t docid) = 0; - - using Vector = std::span; - virtual std::vector topK(uint32_t k, Vector vector, uint32_t search_k) = 0; - virtual std::vector topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) = 0; - virtual ~NNS() {} -protected: - uint32_t _numDims; - const DocVectorAccess &_dva; -}; - -extern -std::unique_ptr> -make_annoy_nns(uint32_t numDims, const DocVectorAccess &dva); - -extern -std::unique_ptr> -make_rplsh_nns(uint32_t numDims, const DocVectorAccess &dva); - -extern -std::unique_ptr> -make_hnsw_nns(uint32_t numDims, const DocVectorAccess &dva); - -extern -std::unique_ptr> -make_hnsw_wrap(uint32_t numDims, const DocVectorAccess &dva); diff --git a/eval/src/tests/ann/point-vector.h b/eval/src/tests/ann/point-vector.h deleted file mode 100644 index f2e1b1549ba6..000000000000 --- a/eval/src/tests/ann/point-vector.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -struct PointVector { - float v[NUM_DIMS]; - using ConstArr = std::span; - operator ConstArr() const { return ConstArr(v, NUM_DIMS); } -}; - -static PointVector *aligned_alloc(size_t num) { - size_t num_bytes = num * sizeof(PointVector); - double mega_bytes = num_bytes / (1024.0*1024.0); - fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes); - char *mem = (char *)malloc(num_bytes + 512); - mem += 512; - size_t val = (size_t)mem; - size_t unalign = val % 512; - mem -= unalign; - return reinterpret_cast(mem); -} - -static PointVector *generatedQueries = aligned_alloc(NUM_Q); -static PointVector *generatedDocs = aligned_alloc(NUM_DOCS); - -struct DocVectorAdapter : public DocVectorAccess -{ - std::span get(uint32_t docid) const override { - ASSERT_TRUE(docid < NUM_DOCS); - return generatedDocs[docid]; - } -}; diff --git a/eval/src/tests/ann/quality-nns.h b/eval/src/tests/ann/quality-nns.h deleted file mode 100644 index 0ad50879b7e4..000000000000 --- a/eval/src/tests/ann/quality-nns.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -bool reach_with_nns_k(NNS_API &nns, uint32_t docid, uint32_t k) { - const PointVector &qv = generatedDocs[docid]; - std::span query(qv.v, NUM_DIMS); - auto rv = nns.topK(k, query, k); - if (rv.size() != k) { - fprintf(stderr, "Result/K=%u from query for %u is %zu hits\n", - k, docid, rv.size()); - return false; - } - if (rv[0].docid != docid) { - if (rv[0].sq.distance != 0.0) - fprintf(stderr, "Expected/K=%u to find %u but got %u with sq distance %.3f\n", - k, docid, rv[0].docid, rv[0].sq.distance); - } - return (rv[0].docid == docid || rv[0].sq.distance == 0.0); -} - -void quality_nns(NNS_API &nns, std::vector sk_list) { - for (uint32_t search_k : sk_list) { - double sum_recall = 0; - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - sum_recall += verify_nns_quality(search_k, nns, cnt); - } - fprintf(stderr, "Overall average recall: %.2f\n", sum_recall / NUM_Q); - } - for (uint32_t search_k : { 1, 10, 100, 1000 }) { - TimePoint bef = std::chrono::steady_clock::now(); - uint32_t reached = 0; - for (uint32_t i = 0; i < NUM_REACH; ++i) { - uint32_t target = i * (NUM_DOCS / NUM_REACH); - if (reach_with_nns_k(nns, target, search_k)) ++reached; - } - fprintf(stderr, "Could reach %u of %u documents with k=%u\n", - reached, NUM_REACH, search_k); - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "reach time k=%u: %.3f ms = %.3f ms/q\n", - search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_REACH); - if (reached == NUM_REACH) break; - } -} diff --git a/eval/src/tests/ann/read-vecs.h b/eval/src/tests/ann/read-vecs.h deleted file mode 100644 index 704c796bdaed..000000000000 --- a/eval/src/tests/ann/read-vecs.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -void read_queries(std::string fn) { - int fd = open(fn.c_str(), O_RDONLY); - ASSERT_TRUE(fd > 0); - int d; - size_t rv; - fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str()); - for (uint32_t qid = 0; qid < NUM_Q; ++qid) { - rv = read(fd, &d, 4); - ASSERT_EQUAL(rv, 4u); - ASSERT_EQUAL(d, NUM_DIMS); - rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float)); - ASSERT_EQUAL(rv, sizeof(PointVector)); - } - close(fd); -} - -void read_docs(std::string fn) { - int fd = open(fn.c_str(), O_RDONLY); - ASSERT_TRUE(fd > 0); - int d; - size_t rv; - fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str()); - for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) { - rv = read(fd, &d, 4); - ASSERT_EQUAL(rv, 4u); - ASSERT_EQUAL(d, NUM_DIMS); - rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float)); - ASSERT_EQUAL(rv, sizeof(PointVector)); - } - close(fd); -} - -void read_data(const std::string& dir, const std::string& data_set) { - fprintf(stderr, "read data set '%s' from directory '%s'\n", data_set.c_str(), dir.c_str()); - TimePoint bef = std::chrono::steady_clock::now(); - read_queries(dir + "/" + data_set + "_query.fvecs"); - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef)); - bef = std::chrono::steady_clock::now(); - read_docs(dir + "/" + data_set + "_base.fvecs"); - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef)); -} diff --git a/eval/src/tests/ann/remove-bm.cpp b/eval/src/tests/ann/remove-bm.cpp deleted file mode 100644 index 6b165dbb1f80..000000000000 --- a/eval/src/tests/ann/remove-bm.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_DIMS 960 -#define NUM_DOCS 250000 -#define NUM_DOCS_REMOVE 50000 -#define EFFECTIVE_DOCS (NUM_DOCS - NUM_DOCS_REMOVE) -#define NUM_REACH 10000 -#define NUM_Q 1000 - -#include "doc_vector_access.h" -#include "nns.h" -#include "for-sift-hit.h" -#include "for-sift-top-k.h" -#include "time-util.h" -#include "point-vector.h" -#include "read-vecs.h" -#include "bruteforce-nns.h" - -using NNS_API = NNS; - -#if 1 -TEST("require that HNSW via NNS api remove all works") { - DocVectorAdapter adapter; - std::unique_ptr nns = make_hnsw_nns(NUM_DIMS, adapter); - fprintf(stderr, "adding and removing all docs forward...\n"); - for (uint32_t i = 0; i < 1000; ++i) { - nns->addDoc(i); - } - for (uint32_t i = 0; i < 1000; ++i) { - nns->removeDoc(i); - } - fprintf(stderr, "adding and removing all docs reverse...\n"); - for (uint32_t i = 1000; i < 2000; ++i) { - nns->addDoc(i); - } - for (uint32_t i = 2000; i-- > 1000; ) { - nns->removeDoc(i); - } -} -#endif - -TEST("require that brute force works") { - TimePoint bef = std::chrono::steady_clock::now(); - fprintf(stderr, "generating %u brute force results\n", NUM_Q); - bruteforceResults.reserve(NUM_Q); - for (uint32_t cnt = 0; cnt < NUM_Q; ++cnt) { - const PointVector &query = generatedQueries[cnt]; - bruteforceResults.emplace_back(bruteforce_nns(query)); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for brute force: %.3f ms = %.3f ms per query\n", - to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); - for (int cnt = 0; cnt < NUM_Q; cnt = (cnt+1)*2) { - verifyBF(cnt); - } -} - -#include "find-with-nns.h" -#include "verify-top-k.h" - -void timing_nns(const char *name, NNS_API &nns, std::vector sk_list) { - for (uint32_t search_k : sk_list) { - TimePoint bef = std::chrono::steady_clock::now(); - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - find_with_nns(search_k, nns, cnt); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for %s search_k=%u: %.3f ms = %.3f ms/q\n", - name, search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); - } -} - -#include "quality-nns.h" - -template -void bm_nns_simple(const char *name, FUNC creator, std::vector sk_list) { - std::unique_ptr nnsp = creator(); - NNS_API &nns = *nnsp; - fprintf(stderr, "trying %s indexing...\n", name); - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { - nns.addDoc(i); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s [A] clean build with %u documents:\n", name, EFFECTIVE_DOCS); - quality_nns(nns, sk_list); - bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.addDoc(EFFECTIVE_DOCS + i); - } - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.removeDoc(EFFECTIVE_DOCS + i); - } - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index add then remove %u docs: %.3f ms\n", - name, NUM_DOCS_REMOVE, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s [B] remove-damaged build with %u documents:\n", name, EFFECTIVE_DOCS); - quality_nns(nns, sk_list); -} - -template -void bm_nns_remove_old(const char *name, FUNC creator, std::vector sk_list) { - std::unique_ptr nnsp = creator(); - NNS_API &nns = *nnsp; - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.addDoc(EFFECTIVE_DOCS + i); - } - for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { - nns.addDoc(i); - } - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.removeDoc(EFFECTIVE_DOCS + i); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s [C] remove-oldest build with %u documents:\n", name, EFFECTIVE_DOCS); - quality_nns(nns, sk_list); -} - -template -void bm_nns_interleave(const char *name, FUNC creator, std::vector sk_list) { - std::unique_ptr nnsp = creator(); - NNS_API &nns = *nnsp; - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.addDoc(EFFECTIVE_DOCS + i); - } - for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) { - nns.addDoc(i); - } - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.removeDoc(EFFECTIVE_DOCS + i); - nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s [D] realistic build with %u documents:\n", name, EFFECTIVE_DOCS); - quality_nns(nns, sk_list); -} - -template -void bm_nns_remove_old_add_new(const char *name, FUNC creator, std::vector sk_list) { - std::unique_ptr nnsp = creator(); - NNS_API &nns = *nnsp; - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.addDoc(EFFECTIVE_DOCS + i); - } - for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) { - nns.addDoc(i); - } - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.removeDoc(EFFECTIVE_DOCS + i); - } - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s [E] remove old, add new build with %u documents:\n", name, EFFECTIVE_DOCS); - quality_nns(nns, sk_list); -} - -template -void benchmark_nns(const char *name, FUNC creator, std::vector sk_list) { - bm_nns_simple(name, creator, sk_list); - bm_nns_remove_old(name, creator, sk_list); - bm_nns_interleave(name, creator, sk_list); - bm_nns_remove_old_add_new(name, creator, sk_list); -} - -#if 0 -TEST("require that Locality Sensitive Hashing mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_rplsh_nns(NUM_DIMS, adapter); }; - benchmark_nns("RPLSH", creator, { 200, 1000 }); -} -#endif - -#if 0 -TEST("require that Annoy via NNS api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_annoy_nns(NUM_DIMS, adapter); }; - benchmark_nns("Annoy", creator, { 8000, 10000 }); -} -#endif - -#if 1 -TEST("require that HNSW via NNS api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_hnsw_nns(NUM_DIMS, adapter); }; - benchmark_nns("HNSW-like", creator, { 100, 150, 200 }); -} -#endif - -#if 0 -TEST("require that HNSW wrapped api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_hnsw_wrap(NUM_DIMS, adapter); }; - benchmark_nns("HNSW-wrap", creator, { 100, 150, 200 }); -} -#endif - -/** - * Before running the benchmark the ANN_GIST1M data set must be downloaded and extracted: - * wget ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz - * tar -xf gist.tar.gz - * - * The benchmark program will load the data set from $HOME/gist if no directory is specified. - * - * More information about the dataset is found here: http://corpus-texmex.irisa.fr/. - */ -int main(int argc, char **argv) { - TEST_MASTER.init(__FILE__); - std::string data_set = "gist"; - std::string data_dir = "."; - if (argc > 2) { - data_set = argv[1]; - data_dir = argv[2]; - } else if (argc > 1) { - data_dir = argv[1]; - } else { - char *home = getenv("HOME"); - if (home) { - data_dir = home; - data_dir += "/" + data_set; - } - } - read_data(data_dir, data_set); - TEST_RUN_ALL(); - return (TEST_MASTER.fini() ? 0 : 1); -} diff --git a/eval/src/tests/ann/sift_benchmark.cpp b/eval/src/tests/ann/sift_benchmark.cpp deleted file mode 100644 index 7dc94c7ce34d..000000000000 --- a/eval/src/tests/ann/sift_benchmark.cpp +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_DIMS 128 -#define NUM_DOCS 1000000 -#define EFFECTIVE_DOCS NUM_DOCS -#define NUM_Q 1000 -#define NUM_REACH 10000 - -#include "doc_vector_access.h" -#include "nns.h" -#include "for-sift-hit.h" -#include "for-sift-top-k.h" -#include "std-random.h" -#include "time-util.h" -#include "point-vector.h" -#include "read-vecs.h" -#include "bruteforce-nns.h" - -TopK bruteforce_nns_filter(const PointVector &query, const BitVector &skipDocIds) { - TopK result; - BfHitHeap heap(result.K); - for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) { - if (skipDocIds.isSet(docid)) continue; - const PointVector &docvector = generatedDocs[docid]; - double d = l2distCalc.l2sq_dist(query, docvector); - Hit h(docid, d); - heap.maybe_use(h); - } - std::vector best = heap.bestHits(); - EXPECT_EQUAL(best.size(), result.K); - for (size_t i = 0; i < result.K; ++i) { - result.hits[i] = best[i]; - } - return result; -} - -void timing_bf_filter(int percent) -{ - BitVector skipDocIds(NUM_DOCS); - RndGen rnd; - for (uint32_t idx = 0; idx < NUM_DOCS; ++idx) { - if (rnd.nextUniform() < 0.01 * percent) { - skipDocIds.setBit(idx); - } else { - skipDocIds.clearBit(idx); - } - } - TimePoint bef = std::chrono::steady_clock::now(); - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - const PointVector &qv = generatedQueries[cnt]; - auto res = bruteforce_nns_filter(qv, skipDocIds); - EXPECT_TRUE(res.hits[res.K - 1].distance > 0.0); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for bruteforce filter %d %%: %.3f ms = %.3f ms/q\n", - percent, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); -} - -TEST("require that brute force works") { - TimePoint bef = std::chrono::steady_clock::now(); - bruteforceResults.reserve(NUM_Q); - for (uint32_t cnt = 0; cnt < NUM_Q; ++cnt) { - const PointVector &query = generatedQueries[cnt]; - bruteforceResults.emplace_back(bruteforce_nns(query)); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for brute force: %.3f ms = %.3f ms per query\n", - to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); - for (int cnt = 0; cnt < NUM_Q; cnt = (cnt+1)*2) { - verifyBF(cnt); - } -#if 1 - for (uint32_t filter_percent : { 0, 1, 10, 50, 90, 95, 99 }) { - timing_bf_filter(filter_percent); - } -#endif -} - -using NNS_API = NNS; - -size_t search_with_filter(uint32_t sk, NNS_API &nns, uint32_t qid, - const BitVector &skipDocIds) -{ - const PointVector &qv = generatedQueries[qid]; - std::span query(qv.v, NUM_DIMS); - auto rv = nns.topKfilter(100, query, sk, skipDocIds); - return rv.size(); -} - -#include "find-with-nns.h" -#include "verify-top-k.h" - -void verify_with_filter(uint32_t sk, NNS_API &nns, uint32_t qid, - const BitVector &skipDocIds) -{ - const PointVector &qv = generatedQueries[qid]; - auto expected = bruteforce_nns_filter(qv, skipDocIds); - std::span query(qv.v, NUM_DIMS); - auto rv = nns.topKfilter(expected.K, query, sk, skipDocIds); - TopK actual; - for (size_t i = 0; i < actual.K; ++i) { - actual.hits[i] = Hit(rv[i].docid, rv[i].sq.distance); - } - verify_top_k(expected, actual, sk, qid); -} - -void timing_nns_filter(const char *name, NNS_API &nns, - std::vector sk_list, int percent) -{ - BitVector skipDocIds(NUM_DOCS); - RndGen rnd; - for (uint32_t idx = 0; idx < NUM_DOCS; ++idx) { - if (rnd.nextUniform() < 0.01 * percent) { - skipDocIds.setBit(idx); - } else { - skipDocIds.clearBit(idx); - } - } - for (uint32_t search_k : sk_list) { - TimePoint bef = std::chrono::steady_clock::now(); - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - uint32_t nh = search_with_filter(search_k, nns, cnt, skipDocIds); - EXPECT_EQUAL(nh, 100u); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for %s filter %d %% search_k=%u: %.3f ms = %.3f ms/q\n", - name, percent, search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); -#if 0 - fprintf(stderr, "Quality check for %s filter %d %%:\n", name, percent); - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - verify_with_filter(search_k, nns, cnt, skipDocIds); - } -#endif - } -} - -void timing_nns(const char *name, NNS_API &nns, std::vector sk_list) { - for (uint32_t search_k : sk_list) { - TimePoint bef = std::chrono::steady_clock::now(); - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - find_with_nns(search_k, nns, cnt); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "timing for %s search_k=%u: %.3f ms = %.3f ms/q\n", - name, search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); - } -} - -#include "quality-nns.h" - -template -void benchmark_nns(const char *name, FUNC creator, std::vector sk_list) { - fprintf(stderr, "trying %s indexing...\n", name); - std::unique_ptr nnsp = creator(); - NNS_API &nns = *nnsp; - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS; ++i) { - nns.addDoc(i); - } - fprintf(stderr, "added %u documents...\n", NUM_DOCS); - find_with_nns(1, nns, 0); - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index: %.3f ms\n", name, to_ms(aft - bef)); - - fprintf(stderr, "Timings for %s :\n", name); - timing_nns(name, nns, sk_list); - for (uint32_t filter_percent : { 0, 1, 10, 50, 90, 95, 99 }) { - timing_nns_filter(name, nns, sk_list, filter_percent); - } - fprintf(stderr, "Quality for %s :\n", name); - quality_nns(nns, sk_list); -} - -#if 0 -TEST("require that Locality Sensitive Hashing mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_rplsh_nns(NUM_DIMS, adapter); }; - benchmark_nns("RPLSH", creator, { 200, 1000 }); -} -#endif - -#if 1 -TEST("require that Annoy via NNS api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_annoy_nns(NUM_DIMS, adapter); }; - benchmark_nns("Annoy", creator, { 8000, 10000 }); -} -#endif - -#if 1 -TEST("require that HNSW via NNS api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_hnsw_nns(NUM_DIMS, adapter); }; - benchmark_nns("HNSW-like", creator, { 100, 150, 200 }); -} -#endif - -#if 0 -TEST("require that HNSW wrapped api mostly works") { - DocVectorAdapter adapter; - auto creator = [&adapter]() { return make_hnsw_wrap(NUM_DIMS, adapter); }; - benchmark_nns("HNSW-wrap", creator, { 100, 150, 200 }); -} -#endif - -/** - * Before running the benchmark the ANN_SIFT1M data set must be downloaded and extracted: - * wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz - * tar -xf sift.tar.gz - * - * To run the program: - * ./eval_sift_benchmark_app - * - * The benchmark program will load the data set from $HOME/sift if no directory is specified. - * - * - * The ANN_GIST1M data set can also be used (as it has the same file format): - * wget ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz - * tar -xf gist.tar.gz - * - * Note that #define NUM_DIMS must be changed to 960 before recompiling and running the program: - * ./eval_sift_benchmark_app gist - * - * - * More information about the datasets is found here: http://corpus-texmex.irisa.fr/. - */ -int main(int argc, char **argv) { - TEST_MASTER.init(__FILE__); - std::string data_set = "sift"; - std::string data_dir = "."; - if (argc > 2) { - data_set = argv[1]; - data_dir = argv[2]; - } else if (argc > 1) { - data_dir = argv[1]; - } else { - char *home = getenv("HOME"); - if (home) { - data_dir = home; - data_dir += "/" + data_set; - } - } - read_data(data_dir, data_set); - TEST_RUN_ALL(); - return (TEST_MASTER.fini() ? 0 : 1); -} diff --git a/eval/src/tests/ann/std-random.h b/eval/src/tests/ann/std-random.h deleted file mode 100644 index d0b81151ebf5..000000000000 --- a/eval/src/tests/ann/std-random.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once -#include - -class RndGen { -private: - std::mt19937_64 urng; - std::normal_distribution normRng; - std::uniform_real_distribution uf; -public: - RndGen() : urng(0x1234deadbeef5678uLL), normRng(), uf(0.0, 1.0) {} - - double nextNormal() { - return normRng(urng); - } - - double nextUniform() { - return uf(urng); - } -}; diff --git a/eval/src/tests/ann/time-util.h b/eval/src/tests/ann/time-util.h deleted file mode 100644 index 468cdffca7cd..000000000000 --- a/eval/src/tests/ann/time-util.h +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -using TimePoint = std::chrono::steady_clock::time_point; -using Duration = std::chrono::steady_clock::duration; - -double to_ms(Duration elapsed) { - std::chrono::duration ms(elapsed); - return ms.count(); -} diff --git a/eval/src/tests/ann/verify-top-k.h b/eval/src/tests/ann/verify-top-k.h deleted file mode 100644 index 7cd0094c9b8e..000000000000 --- a/eval/src/tests/ann/verify-top-k.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -int verify_top_k(const TopK &perfect, const TopK &result, uint32_t sk, uint32_t qid) { - int recall = perfect.recall(result); - EXPECT_TRUE(recall > 40); - double sum_error = 0.0; - double c_factor = 1.0; - for (size_t i = 0; i < result.K; ++i) { - double factor = (result.hits[i].distance / perfect.hits[i].distance); - if (factor < 0.99 || factor > 25) { - fprintf(stderr, "hit[%zu] got distance %.3f, expected %.3f\n", - i, result.hits[i].distance, perfect.hits[i].distance); - } - sum_error += factor; - c_factor = std::max(c_factor, factor); - } - EXPECT_TRUE(c_factor < 1.5); - fprintf(stderr, "quality sk=%u: query %u: recall %d c2-factor %.3f avg c2: %.3f\n", - sk, qid, recall, c_factor, sum_error / result.K); - return recall; -} - -int verify_nns_quality(uint32_t sk, NNS_API &nns, uint32_t qid) { - TopK perfect = bruteforceResults[qid]; - TopK result = find_with_nns(sk, nns, qid); - return verify_top_k(perfect, result, sk, qid); -} diff --git a/eval/src/tests/ann/xp-annoy-nns.cpp b/eval/src/tests/ann/xp-annoy-nns.cpp deleted file mode 100644 index 4f6540d1a69d..000000000000 --- a/eval/src/tests/ann/xp-annoy-nns.cpp +++ /dev/null @@ -1,480 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "nns.h" -#include "std-random.h" -#include -#include -#include -#include -#include - -using V = std::span; -class AnnoyLikeNns; -inline namespace xpannoynns { struct Node; } - -static size_t plane_dist_cnt = 0; -static size_t w_cen_dist_cnt = 0; -static size_t leaf_split_cnt = 0; -static size_t find_top_k_cnt = 0; -static size_t find_cand_cnt = 0; - -using QueueNode = std::pair; -using NodeQueue = std::priority_queue; - -inline namespace xpannoynns { - -struct Node { - Node() {} - virtual ~Node() {} - virtual Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) = 0; - virtual int remove(uint32_t docid, V vector) = 0; - virtual void findCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist) const = 0; - virtual void filterCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist, const BitVector &skipDocIds) const = 0; - virtual void stats(std::vector &depths) = 0; -}; - -} - -struct LeafNode : public Node { - std::vector docids; - - LeafNode() : Node(), docids() { docids.reserve(128); } - - Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) override; - int remove(uint32_t docid, V vector) override; - void findCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist) const override; - void filterCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist, const BitVector &skipDocIds) const override; - - Node *split(AnnoyLikeNns &meta); - virtual void stats(std::vector &depths) override { depths.push_back(1); } -}; - -struct SplitNode : public Node { - std::vector hyperPlane; - double offsetFromOrigo; - Node *leftChildren; - Node *rightChildren; - - SplitNode() : Node(), hyperPlane(), offsetFromOrigo(), leftChildren(), rightChildren() {} - ~SplitNode(); - - Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) override; - int remove(uint32_t docid, V vector) override; - void findCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist) const override; - void filterCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist, const BitVector &skipDocIds) const override; - - double planeDistance(V vector) const; - virtual void stats(std::vector &depths) override { - size_t i = depths.size(); - leftChildren->stats(depths); - rightChildren->stats(depths); - while (i < depths.size()) { ++depths[i++]; } - } -}; - -class AnnoyLikeNns : public NNS -{ -private: - std::vector _roots; - RndGen _rndGen; - static constexpr size_t numRoots = 50; - -public: - AnnoyLikeNns(uint32_t numDims, const DocVectorAccess &dva) - : NNS(numDims, dva), _roots(), _rndGen() - { - _roots.reserve(numRoots); - for (size_t i = 0; i < numRoots; ++i) { - _roots.push_back(new LeafNode()); - } - } - - void dumpStats(); - - ~AnnoyLikeNns() { - dumpStats(); - for (Node *root : _roots) { - delete root; - } - } - - void addDoc(uint32_t docid) override { - V vector = _dva.get(docid); - for (Node * &root : _roots) { - root = root->addDoc(docid, vector, *this); - } - } - - void removeDoc(uint32_t docid) override { - V vector = _dva.get(docid); - for (Node * root : _roots) { - root->remove(docid, vector); - } - } - std::vector topK(uint32_t k, Vector vector, uint32_t search_k) override; - - std::vector topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &bitvector) override; - - V getVector(uint32_t docid) const { return _dva.get(docid); } - double uniformRnd() { return _rndGen.nextUniform(); } - uint32_t dims() const { return _numDims; } -}; - - -double -SplitNode::planeDistance(V vector) const -{ - ++plane_dist_cnt; - assert(vector.size() == hyperPlane.size()); - double dp = l2distCalc.product(&vector[0], &hyperPlane[0], vector.size()); - return dp - offsetFromOrigo; -} - - -Node * -LeafNode::addDoc(uint32_t docid, V, AnnoyLikeNns &meta) -{ - docids.push_back(docid); - if (docids.size() > 127) { - return split(meta); - } - return this; -} - -struct WeightedCentroid { - uint32_t cnt; - std::vector sum_point; - std::vector tmp_vector; - WeightedCentroid(V vector) - : cnt(1), sum_point(), tmp_vector(vector.size()) - { - sum_point.reserve(vector.size()); - for (float val : vector) { - sum_point.push_back(val); - } - } - void add_v(V vector) { - ++cnt; - for (size_t i = 0; i < vector.size(); ++i) { - sum_point[i] += vector[i]; - } - } - std::vector norm_diff(WeightedCentroid other) { - std::vector r; - const size_t sz = sum_point.size(); - double my_inv = 1.0 / cnt; - double ot_inv = 1.0 / other.cnt; - double sumSq = 0.0; - r.reserve(sz); - for (size_t i = 0; i < sz; ++i) { - double d = (sum_point[i] * my_inv) - (other.sum_point[i] * ot_inv); - r.push_back(d); - sumSq += d*d; - } - if (sumSq > 0) { - double invnorm = 1.0 / sqrt(sumSq); - for (size_t i = 0; i < sz; ++i) { - r[i] *= invnorm; - } - } - return r; - } - std::vector midpoint(WeightedCentroid other) { - std::vector r; - size_t sz = sum_point.size(); - r.reserve(sz); - double my_inv = 1.0 / cnt; - double ot_inv = 1.0 / other.cnt; - for (size_t i = 0; i < sz; ++i) { - double mp = (sum_point[i] * my_inv) + (other.sum_point[i] * ot_inv); - r.push_back(mp * 0.5); - } - return r; - } - double weightedDistance(V vector) { - ++w_cen_dist_cnt; - size_t sz = vector.size(); - for (size_t i = 0; i < sz; ++i) { - tmp_vector[i] = vector[i] * cnt; - } - return l2distCalc.l2sq_dist(tmp_vector, sum_point) / cnt; - } - ~WeightedCentroid() {} -}; - -Node * -LeafNode::split(AnnoyLikeNns &meta) -{ - ++leaf_split_cnt; - uint32_t dims = meta.dims(); - uint32_t retries = 3; -retry: - uint32_t p1i = uint32_t(meta.uniformRnd() * docids.size()); - uint32_t p2i = uint32_t(meta.uniformRnd() * (docids.size()-1)); - if (p2i >= p1i) ++p2i; - uint32_t p1d = docids[p1i]; - uint32_t p2d = docids[p2i]; - V p1 = meta.getVector(p1d); - V p2 = meta.getVector(p2d); - - double sumsq = 0; - for (size_t i = 0; i < dims; ++i) { - double d = p1[i] - p2[i]; - sumsq += d*d; - } - if ((!(sumsq > 0)) && (retries-- > 0)) { - goto retry; - } - WeightedCentroid centroid1(p1); - WeightedCentroid centroid2(p2); -#if 1 - for (size_t i = 0; (i * 1) < docids.size(); ++i) { - size_t p3i = (p1i + p2i + i) % docids.size(); - uint32_t p3d = docids[p3i]; - V p3 = meta.getVector(p3d); - double dist_c1 = centroid1.weightedDistance(p3); - double dist_c2 = centroid2.weightedDistance(p3); - bool use_c1 = false; - if (dist_c1 < dist_c2) { - use_c1 = true; - } else if (dist_c1 > dist_c2) { - use_c1 = false; - } else if (centroid1.cnt < centroid2.cnt) { - use_c1 = true; - } - if (use_c1) { - centroid1.add_v(p3); - } else { - centroid2.add_v(p3); - } - } -#endif - std::vector diff = centroid1.norm_diff(centroid2); - std::vector mp = centroid1.midpoint(centroid2); - double off = l2distCalc.product(diff, mp); - - SplitNode *s = new SplitNode(); - s->hyperPlane = std::move(diff); - s->offsetFromOrigo = off; - - std::vector leftDs; - std::vector rightDs; - leftDs.reserve(128); - rightDs.reserve(128); - - for (uint32_t docid : docids) { - V vector = meta.getVector(docid); - double dist = s->planeDistance(vector); - bool left = false; - if (dist < 0) { - left = true; - } else if (!(dist > 0)) { - left = (leftDs.size() < rightDs.size()); - } - if (left) { - leftDs.push_back(docid); - } else { - rightDs.push_back(docid); - } - } - -#if 0 - fprintf(stderr, "splitting leaf node numChildren %u\n", numChildren); - fprintf(stderr, "dims = %u\n", dims); - fprintf(stderr, "p1 idx=%u, docid=%u VSZ=%zu\n", p1i, p1d, p1.size()); - fprintf(stderr, "p2 idx=%u, docid=%u VSZ=%zu\n", p2i, p2d, p2.size()); - fprintf(stderr, "diff %zu sumsq = %g\n", diff.size(), sumsq); - fprintf(stderr, "offset from origo = %g\n", off); - fprintf(stderr, "split left=%zu, right=%zu\n", leftDs.size(), rightDs.size()); -#endif - - LeafNode *newRightNode = new LeafNode(); - newRightNode->docids = std::move(rightDs); - s->rightChildren = newRightNode; - this->docids = std::move(leftDs); - s->leftChildren = this; - return s; -} - -int -LeafNode::remove(uint32_t docid, V) -{ - auto iter = std::remove(docids.begin(), docids.end(), docid); - int removed = docids.end() - iter; - docids.erase(iter, docids.end()); - return removed; -} - -void -LeafNode::findCandidates(std::set &cands, V, NodeQueue &, double) const -{ - for (uint32_t d : docids) { - cands.insert(d); - } -} - -void -LeafNode::filterCandidates(std::set &cands, V, NodeQueue &, double, const BitVector &skipDocIds) const -{ - for (uint32_t d : docids) { - if (skipDocIds.isSet(d)) continue; - cands.insert(d); - } -} - - -SplitNode::~SplitNode() -{ - delete leftChildren; - delete rightChildren; -} - -Node * -SplitNode::addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) -{ - double d = planeDistance(vector); - if (d < 0) { - leftChildren = leftChildren->addDoc(docid, vector, meta); - } else { - rightChildren = rightChildren->addDoc(docid, vector, meta); - } - return this; -} - -int -SplitNode::remove(uint32_t docid, V vector) -{ - double d = planeDistance(vector); - if (d < 0) { - int r = leftChildren->remove(docid, vector); - return r; - } else { - int r = rightChildren->remove(docid, vector); - return r; - } -} - -void -SplitNode::findCandidates(std::set &, V vector, NodeQueue &queue, double minDist) const -{ - double d = planeDistance(vector); - // fprintf(stderr, "push 2 nodes dist %g\n", d); - queue.push(std::make_pair(std::min(-d, minDist), leftChildren)); - queue.push(std::make_pair(std::min(d, minDist), rightChildren)); -} - -void -SplitNode::filterCandidates(std::set &, V vector, NodeQueue &queue, double minDist, const BitVector &) const -{ - double d = planeDistance(vector); - // fprintf(stderr, "push 2 nodes dist %g\n", d); - queue.push(std::make_pair(std::min(-d, minDist), leftChildren)); - queue.push(std::make_pair(std::min(d, minDist), rightChildren)); -} - -std::vector -AnnoyLikeNns::topK(uint32_t k, Vector vector, uint32_t search_k) -{ - ++find_top_k_cnt; - std::vector tmp; - tmp.resize(_numDims); - std::span tmpArr(tmp); - - std::vector r; - r.reserve(k); - std::set candidates; - NodeQueue queue; - // fprintf(stderr, "find %u candidates\n", k); - for (Node *root : _roots) { - double dist = std::numeric_limits::max(); - queue.push(std::make_pair(dist, root)); - } - while ((candidates.size() < std::max(k, search_k)) && (queue.size() > 0)) { - const QueueNode& top = queue.top(); - double md = top.first; - // fprintf(stderr, "find candidates: node with min distance %g\n", md); - Node *n = top.second; - queue.pop(); - n->findCandidates(candidates, vector, queue, md); - ++find_cand_cnt; - } -#if 0 - while (queue.size() > 0) { - const QueueNode& top = queue.top(); - fprintf(stderr, "discard candidates: node with distance %g\n", top.first); - queue.pop(); - } -#endif - for (uint32_t docid : candidates) { - double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr); - NnsHit hit(docid, SqDist(dist)); - r.push_back(hit); - } - std::sort(r.begin(), r.end(), NnsHitComparatorLessDistance()); - while (r.size() > k) r.pop_back(); - return r; -} - -std::vector -AnnoyLikeNns::topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) -{ - ++find_top_k_cnt; - std::vector r; - r.reserve(k); - std::set candidates; - NodeQueue queue; - for (Node *root : _roots) { - double dist = std::numeric_limits::max(); - queue.push(std::make_pair(dist, root)); - } - while ((candidates.size() < std::max(k, search_k)) && (queue.size() > 0)) { - const QueueNode& top = queue.top(); - double md = top.first; - // fprintf(stderr, "find candidates: node with min distance %g\n", md); - Node *n = top.second; - queue.pop(); - n->filterCandidates(candidates, vector, queue, md, skipDocIds); - ++find_cand_cnt; - } - for (uint32_t docid : candidates) { - if (skipDocIds.isSet(docid)) continue; - double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid)); - NnsHit hit(docid, SqDist(dist)); - r.push_back(hit); - } - std::sort(r.begin(), r.end(), NnsHitComparatorLessDistance()); - while (r.size() > k) r.pop_back(); - return r; -} - - - -void -AnnoyLikeNns::dumpStats() { - fprintf(stderr, "stats for AnnoyLikeNns:\n"); - fprintf(stderr, "planeDistance() calls: %zu\n", plane_dist_cnt); - fprintf(stderr, "weightedDistance() calls: %zu\n", w_cen_dist_cnt); - fprintf(stderr, "leaf split() calls: %zu\n", leaf_split_cnt); - fprintf(stderr, "topK() calls: %zu\n", find_top_k_cnt); - fprintf(stderr, "findCandidates() calls: %zu\n", find_cand_cnt); - std::vector depths; - _roots[0]->stats(depths); - std::vector counts; - for (uint32_t deep : depths) { - while (counts.size() <= deep) counts.push_back(0); - counts[deep]++; - } - fprintf(stderr, "depths for %zu leaves [\n", depths.size()); - for (uint32_t deep = 0; deep < counts.size(); ++deep) { - if (counts[deep] > 0) { - fprintf(stderr, "%u deep count %u\n", deep, counts[deep]); - } - } - fprintf(stderr, "]\n"); -} - -std::unique_ptr> -make_annoy_nns(uint32_t numDims, const DocVectorAccess &dva) -{ - return std::make_unique(numDims, dva); -} diff --git a/eval/src/tests/ann/xp-hnsw-wrap.cpp b/eval/src/tests/ann/xp-hnsw-wrap.cpp deleted file mode 100644 index 586bc07109e9..000000000000 --- a/eval/src/tests/ann/xp-hnsw-wrap.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "nns.h" -#include -#include "/git/hnswlib/hnswlib/hnswlib.h" - -class HnswWrapNns : public NNS -{ -private: - using Implementation = hnswlib::HierarchicalNSW; - hnswlib::L2Space _l2space; - Implementation _hnsw; - -public: - HnswWrapNns(uint32_t numDims, const DocVectorAccess &dva) - : NNS(numDims, dva), - _l2space(numDims), - _hnsw(&_l2space, 2500000, 16, 200) - { - } - - ~HnswWrapNns() {} - - void addDoc(uint32_t docid) override { - Vector vector = _dva.get(docid); - _hnsw.addPoint(vector.cbegin(), docid); - } - - void removeDoc(uint32_t docid) override { - _hnsw.markDelete(docid); - } - - std::vector topK(uint32_t k, Vector vector, uint32_t search_k) override { - std::vector reversed; - _hnsw.setEf(search_k); - auto priQ = _hnsw.searchKnn(vector.cbegin(), k); - while (! priQ.empty()) { - auto pair = priQ.top(); - reversed.emplace_back(pair.second, SqDist(pair.first)); - priQ.pop(); - } - std::vector result; - while (result.size() < k && !reversed.empty()) { - result.push_back(reversed.back()); - reversed.pop_back(); - } - return result; - } - - std::vector topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) override { - std::vector reversed; - uint32_t adjusted_k = k+4; - uint32_t adjusted_sk = search_k+4; - for (int retry = 0; (retry < 5) && (reversed.size() < k); ++retry) { - reversed.clear(); - _hnsw.setEf(adjusted_sk); - auto priQ = _hnsw.searchKnn(vector.cbegin(), adjusted_k); - while (! priQ.empty()) { - auto pair = priQ.top(); - if (! skipDocIds.isSet(pair.second)) { - reversed.emplace_back(pair.second, SqDist(pair.first)); - } - priQ.pop(); - } - double got = 1 + reversed.size(); - double factor = 1.25 * k / got; - adjusted_k *= factor; - adjusted_sk *= factor; - } - std::vector result; - while (result.size() < k && !reversed.empty()) { - result.push_back(reversed.back()); - reversed.pop_back(); - } - return result; - } -}; - -std::unique_ptr> -make_hnsw_wrap(uint32_t numDims, const DocVectorAccess &dva) -{ - NNS *p = new HnswWrapNns(numDims, dva); - return std::unique_ptr>(p); -} diff --git a/eval/src/tests/ann/xp-hnswlike-nns.cpp b/eval/src/tests/ann/xp-hnswlike-nns.cpp deleted file mode 100644 index 72200bd4b0fa..000000000000 --- a/eval/src/tests/ann/xp-hnswlike-nns.cpp +++ /dev/null @@ -1,544 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "hnsw-like.h" - -/* - Todo: - - measure effect of: - 1) removing leftover backlinks during "shrink" operation - 2) refilling to low-watermark after 1) happens - 3) refilling to mid-watermark after 1) happens - 4) adding then removing 20% extra documents - 5) removing 20% first-added documents - 6) removing first-added documents while inserting new ones - - 7) auto-tune search_k to ensure >= 50% recall on 1000 Q with k=100 - 8) auto-tune search_k to ensure avg 90% recall on 1000 Q with k=100 - 9) auto-tune search_k to ensure >= 90% reachability of 10000 docids - - 10) timings for SIFT, GIST, and DEEP data (100k, 200k, 300k, 500k, 700k, 1000k) - */ - -static size_t distcalls_simple; -static size_t distcalls_search_layer; -static size_t distcalls_other; -static size_t distcalls_heuristic; -static size_t distcalls_shrink; -static size_t distcalls_refill; -static size_t refill_needed_calls; -static size_t shrink_needed_calls; -static size_t disconnected_weak_links; -static size_t disconnected_for_symmetry; -static size_t select_n_full; -static size_t select_n_partial; - - - -HnswLikeNns::HnswLikeNns(uint32_t numDims, const DocVectorAccess &dva) - : NNS(numDims, dva), - _nodes(), - _entryId(0), - _entryLevel(-1), - _M(16), - _efConstruction(200), - _levelMultiplier(1.0 / log(1.0 * _M)), - _rndGen(), - _ops_counter(0) -{ -} - -// simple greedy search -HnswHit -HnswLikeNns::search_layer_simple(Vector vector, HnswHit curPoint, uint32_t searchLevel) { - bool keepGoing = true; - while (keepGoing) { - keepGoing = false; - const LinkList& neighbors = getLinkList(curPoint.docid, searchLevel); - for (uint32_t n_id : neighbors) { - double dist = distance(vector, n_id); - ++distcalls_simple; - if (dist < curPoint.dist) { - curPoint = HnswHit(n_id, SqDist(dist)); - keepGoing = true; - } - } - } - return curPoint; -} - -bool -HnswLikeNns::haveCloserDistance(HnswHit e, const LinkList &r) const { - for (uint32_t prevId : r) { - double dist = distance(e.docid, prevId); - ++distcalls_heuristic; - if (dist < e.dist) return true; - } - return false; -} - -void -HnswLikeNns::addDoc(uint32_t docid) { - Vector vector = _dva.get(docid); - for (uint32_t id = _nodes.size(); id <= docid; ++id) { - _nodes.emplace_back(id, 0, _M); - } - int level = randomLevel(); - assert(_nodes[docid]._links.size() == 0); - _nodes[docid] = Node(docid, level+1, _M); - if (_entryLevel < 0) { - _entryId = docid; - _entryLevel = level; - track_ops(); - return; - } - int searchLevel = _entryLevel; - double entryDist = distance(vector, _entryId); - ++distcalls_other; - HnswHit entryPoint(_entryId, SqDist(entryDist)); - while (searchLevel > level) { - entryPoint = search_layer_simple(vector, entryPoint, searchLevel); - --searchLevel; - } - searchLevel = std::min(level, _entryLevel); - FurthestPriQ w; - w.push(entryPoint); - while (searchLevel >= 0) { - search_layer(vector, w, _efConstruction, searchLevel); - LinkList neighbors = select_neighbors(w.peek(), _M); - connect_new_node(docid, neighbors, searchLevel); - each_shrink_ifneeded(neighbors, searchLevel); - --searchLevel; - } - if (level > _entryLevel) { - _entryLevel = level; - _entryId = docid; - } - track_ops(); -} - -void -HnswLikeNns::track_ops() { - _ops_counter++; - if ((_ops_counter % 10000) == 0) { - double div = _ops_counter; - fprintf(stderr, "add / remove ops: %zu\n", _ops_counter); - fprintf(stderr, "distance calls for layer: %zu is %.3f per op\n", distcalls_search_layer, distcalls_search_layer/ div); - fprintf(stderr, "distance calls for heuristic: %zu is %.3f per op\n", distcalls_heuristic, distcalls_heuristic / div); - fprintf(stderr, "distance calls for simple: %zu is %.3f per op\n", distcalls_simple, distcalls_simple / div); - fprintf(stderr, "distance calls for shrink: %zu is %.3f per op\n", distcalls_shrink, distcalls_shrink / div); - fprintf(stderr, "distance calls for refill: %zu is %.3f per op\n", distcalls_refill, distcalls_refill / div); - fprintf(stderr, "distance calls for other: %zu is %.3f per op\n", distcalls_other, distcalls_other / div); - fprintf(stderr, "refill needed calls: %zu is %.3f per op\n", refill_needed_calls, refill_needed_calls / div); - fprintf(stderr, "shrink needed calls: %zu is %.3f per op\n", shrink_needed_calls, shrink_needed_calls / div); - fprintf(stderr, "disconnected weak links: %zu is %.3f per op\n", disconnected_weak_links, disconnected_weak_links / div); - fprintf(stderr, "disconnected for symmetry: %zu is %.3f per op\n", disconnected_for_symmetry, disconnected_for_symmetry / div); - fprintf(stderr, "select neighbors: partial %zu vs full %zu\n", select_n_partial, select_n_full); - } -} - -void -HnswLikeNns::refill_ifneeded(uint32_t my_id, const LinkList &replacements, uint32_t level) { - LinkList &my_links = getLinkList(my_id, level); - if (my_links.size() < 8) { - ++refill_needed_calls; - for (uint32_t repl_id : replacements) { - if (repl_id == my_id) continue; - if (my_links.has_link_to(repl_id)) continue; - LinkList &other_links = getLinkList(repl_id, level); - if (other_links.size() + 1 >= _M) continue; - other_links.push_back(my_id); - my_links.push_back(repl_id); - if (my_links.size() >= _M) return; - } - } -} - -void -HnswLikeNns::shrink_links(uint32_t shrink_id, uint32_t maxLinks, uint32_t level) { - LinkList &links = getLinkList(shrink_id, level); - NearestList distances; - for (uint32_t n_id : links) { - double n_dist = distance(shrink_id, n_id); - ++distcalls_shrink; - distances.emplace_back(n_id, SqDist(n_dist)); - } - LinkList lostLinks; - LinkList oldLinks = links; - links = remove_weakest(distances, maxLinks, lostLinks); -#define KEEP_SYM -#ifdef KEEP_SYM - for (uint32_t lost_id : lostLinks) { - ++disconnected_for_symmetry; - remove_link_from(lost_id, shrink_id, level); - } -#define DO_REFILL_AFTER_KEEP_SYM -#ifdef DO_REFILL_AFTER_KEEP_SYM - for (uint32_t lost_id : lostLinks) { - refill_ifneeded(lost_id, oldLinks, level); - } -#endif -#endif -} - -void -HnswLikeNns::removeDoc(uint32_t docid) { - Node &node = _nodes[docid]; - bool need_new_entrypoint = (docid == _entryId); - for (int level = node._links.size(); level-- > 0; ) { - LinkList my_links; - my_links.swap(node._links[level]); - for (uint32_t n_id : my_links) { - if (need_new_entrypoint) { - _entryId = n_id; - _entryLevel = level; - need_new_entrypoint = false; - } - remove_link_from(n_id, docid, level); - } - while (! my_links.empty()) { - uint32_t n_id = my_links.back(); - my_links.pop_back(); - refill_ifneeded(n_id, my_links, level); - } - } - node = Node(docid, 0, _M); - if (need_new_entrypoint) { - _entryLevel = -1; - _entryId = 0; - for (uint32_t i = 0; i < _nodes.size(); ++i) { - if (_nodes[i]._links.size() > 0) { - _entryId = i; - _entryLevel = _nodes[i]._links.size() - 1; - break; - } - } - } - track_ops(); -} - -std::vector -HnswLikeNns::topK(uint32_t k, Vector vector, uint32_t search_k) { - std::vector result; - if (_entryLevel < 0) return result; - double entryDist = distance(vector, _entryId); - ++distcalls_other; - HnswHit entryPoint(_entryId, SqDist(entryDist)); - int searchLevel = _entryLevel; - while (searchLevel > 0) { - entryPoint = search_layer_simple(vector, entryPoint, searchLevel); - --searchLevel; - } - FurthestPriQ w; - w.push(entryPoint); - search_layer(vector, w, std::max(k, search_k), 0); - while (w.size() > k) { - w.pop(); - } - NearestList tmp = w.steal(); - std::sort(tmp.begin(), tmp.end(), LesserDist()); - result.reserve(tmp.size()); - for (const auto & hit : tmp) { - result.emplace_back(hit.docid, SqDist(hit.dist)); - } - return result; -} - - -double -HnswLikeNns::distance(Vector v, uint32_t b) const -{ - Vector w = _dva.get(b); - return l2distCalc.l2sq_dist(v, w); -} - -std::vector -HnswLikeNns::topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) -{ - std::vector result; - if (_entryLevel < 0) return result; - double entryDist = distance(vector, _entryId); - ++distcalls_other; - HnswHit entryPoint(_entryId, SqDist(entryDist)); - int searchLevel = _entryLevel; - while (searchLevel > 0) { - entryPoint = search_layer_simple(vector, entryPoint, searchLevel); - --searchLevel; - } - FurthestPriQ w; - w.push(entryPoint); - search_layer_with_filter(vector, w, std::max(k, search_k), 0, skipDocIds); - NearestList tmp = w.steal(); - std::sort(tmp.begin(), tmp.end(), LesserDist()); - result.reserve(std::min((size_t)k, tmp.size())); - for (const auto & hit : tmp) { - if (skipDocIds.isSet(hit.docid)) continue; - result.emplace_back(hit.docid, SqDist(hit.dist)); - if (result.size() == k) break; - } - return result; -} - -void -HnswLikeNns::each_shrink_ifneeded(const LinkList &neighbors, uint32_t level) { - uint32_t maxLinks = (level > 0) ? _M : (2 * _M); - for (uint32_t old_id : neighbors) { - LinkList &oldLinks = getLinkList(old_id, level); - if (oldLinks.size() > maxLinks) { - ++shrink_needed_calls; - shrink_links(old_id, maxLinks, level); - } - } -} - -void -HnswLikeNns::search_layer(Vector vector, FurthestPriQ &w, - uint32_t ef, uint32_t searchLevel) -{ - NearestPriQ candidates; - VisitedSet &visited = _visitedSetPool.get(_nodes.size()); - - for (const HnswHit & entry : w.peek()) { - candidates.push(entry); - visited.mark(entry.docid); - } - double limd = std::numeric_limits::max(); - while (! candidates.empty()) { - HnswHit cand = candidates.top(); - if (cand.dist > limd) { - break; - } - candidates.pop(); - for (uint32_t e_id : getLinkList(cand.docid, searchLevel)) { - if (visited.isMarked(e_id)) continue; - visited.mark(e_id); - double e_dist = distance(vector, e_id); - ++distcalls_search_layer; - if (e_dist < limd) { - candidates.emplace(e_id, SqDist(e_dist)); - w.emplace(e_id, SqDist(e_dist)); - if (w.size() > ef) { - w.pop(); - limd = w.top().dist; - } - } - } - } - return; -} - -void -HnswLikeNns::search_layer_with_filter(Vector vector, FurthestPriQ &w, - uint32_t ef, uint32_t searchLevel, - const BitVector &skipDocIds) -{ - NearestPriQ candidates; - VisitedSet &visited = _visitedSetPool.get(_nodes.size()); - - for (const HnswHit & entry : w.peek()) { - candidates.push(entry); - visited.mark(entry.docid); - if (skipDocIds.isSet(entry.docid)) ++ef; - } - double limd = std::numeric_limits::max(); - while (! candidates.empty()) { - HnswHit cand = candidates.top(); - if (cand.dist > limd) { - break; - } - candidates.pop(); - for (uint32_t e_id : getLinkList(cand.docid, searchLevel)) { - if (visited.isMarked(e_id)) continue; - visited.mark(e_id); - double e_dist = distance(vector, e_id); - ++distcalls_search_layer; - if (e_dist < limd) { - candidates.emplace(e_id, SqDist(e_dist)); - if (skipDocIds.isSet(e_id)) continue; - w.emplace(e_id, SqDist(e_dist)); - if (w.size() > ef) { - w.pop(); - limd = w.top().dist; - } - } - } - } -} - -LinkList -HnswLikeNns::remove_weakest(const NearestList &neighbors, uint32_t curMax, LinkList &lost) const -{ - LinkList result; - result.reserve(curMax+1); - NearestPriQ w; - for (const auto & entry : neighbors) { - w.push(entry); - } - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (result.size() == curMax || haveCloserDistance(e, result)) { - lost.push_back(e.docid); - } else { - result.push_back(e.docid); - } - } - return result; -} - -#define NO_BACKFILL -#ifdef NO_BACKFILL -LinkList -HnswLikeNns::select_neighbors(const NearestList &neighbors, uint32_t curMax) const -{ - LinkList result; - result.reserve(curMax+1); - NearestPriQ w; - for (const auto & entry : neighbors) { - w.push(entry); - } - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (haveCloserDistance(e, result)) { - continue; - } - result.push_back(e.docid); - if (result.size() == curMax) { - ++select_n_full; - return result; - } - } - ++select_n_partial; - return result; -} -#else -LinkList -HnswLikeNns::select_neighbors(const NearestList &neighbors, uint32_t curMax) const -{ - LinkList result; - result.reserve(curMax+1); - bool needFiltering = (neighbors.size() > curMax); - NearestPriQ w; - for (const auto & entry : neighbors) { - w.push(entry); - } - LinkList backfill; - while (! w.empty()) { - HnswHit e = w.top(); - w.pop(); - if (needFiltering && haveCloserDistance(e, result)) { - backfill.push_back(e.docid); - continue; - } - result.push_back(e.docid); - if (result.size() == curMax) return result; - } - if (result.size() * 4 < _M) { - for (uint32_t fill_id : backfill) { - result.push_back(fill_id); - if (result.size() * 2 >= _M) break; - } - } - return result; -} -#endif - -void -HnswLikeNns::connect_new_node(uint32_t id, const LinkList &neighbors, uint32_t level) { - LinkList &newLinks = getLinkList(id, level); - for (uint32_t neigh_id : neighbors) { - LinkList &oldLinks = getLinkList(neigh_id, level); - newLinks.push_back(neigh_id); - oldLinks.push_back(id); - } -} - -uint32_t -HnswLikeNns::count_reachable() const { - VisitedSet visited(_nodes.size()); - int level = _entryLevel; - LinkList curList; - curList.push_back(_entryId); - visited.mark(_entryId); - uint32_t idx = 0; - while (level >= 0) { - while (idx < curList.size()) { - uint32_t id = curList[idx++]; - const LinkList &links = getLinkList(id, level); - for (uint32_t n_id : links) { - if (visited.isMarked(n_id)) continue; - visited.mark(n_id); - curList.push_back(n_id); - } - } - --level; - idx = 0; - } - return curList.size(); -} - -void -HnswLikeNns::dumpStats() const { - std::vector levelCounts; - levelCounts.resize(_entryLevel + 2); - std::vector outLinkHist; - outLinkHist.resize(2 * _M + 2); - uint32_t symmetrics = 0; - uint32_t level1links = 0; - uint32_t both_l_links = 0; - fprintf(stderr, "stats for HnswLikeNns with %zu nodes, entry level = %d, entry id = %u\n", - _nodes.size(), _entryLevel, _entryId); - - for (uint32_t id = 0; id < _nodes.size(); ++id) { - const auto &node = _nodes[id]; - uint32_t levels = node._links.size(); - levelCounts[levels]++; - if (levels < 1) { - outLinkHist[0]++; - continue; - } - const LinkList &link_list = getLinkList(id, 0); - uint32_t numlinks = link_list.size(); - outLinkHist[numlinks]++; - if (numlinks < 1) { - fprintf(stderr, "node with %u links: id %u\n", numlinks, id); - } - bool all_sym = true; - for (uint32_t n_id : link_list) { - const LinkList &neigh_list = getLinkList(n_id, 0); - if (! neigh_list.has_link_to(id)) { -#ifdef KEEP_SYM - fprintf(stderr, "BAD: %u has link to neighbor %u, but backlink is missing\n", id, n_id); -#endif - all_sym = false; - } - } - if (all_sym) ++symmetrics; - if (levels < 2) continue; - const LinkList &link_list_1 = getLinkList(id, 1); - for (uint32_t n_id : link_list_1) { - ++level1links; - if (link_list.has_link_to(n_id)) ++both_l_links; - } - } - for (uint32_t l = 0; l < levelCounts.size(); ++l) { - fprintf(stderr, "Nodes on %u levels: %u\n", l, levelCounts[l]); - } - fprintf(stderr, "reachable nodes %u / %zu\n", - count_reachable(), _nodes.size() - levelCounts[0]); - fprintf(stderr, "level 1 links overlapping on l0: %u / total: %u\n", - both_l_links, level1links); - for (uint32_t l = 0; l < outLinkHist.size(); ++l) { - if (outLinkHist[l] != 0) { - fprintf(stderr, "Nodes with %u outward links on L0: %u\n", l, outLinkHist[l]); - } - } - fprintf(stderr, "Symmetric in-out nodes: %u\n", symmetrics); -} - -std::unique_ptr> -make_hnsw_nns(uint32_t numDims, const DocVectorAccess &dva) -{ - return std::make_unique(numDims, dva); -} diff --git a/eval/src/tests/ann/xp-lsh-nns.cpp b/eval/src/tests/ann/xp-lsh-nns.cpp deleted file mode 100644 index 225c1c286b0d..000000000000 --- a/eval/src/tests/ann/xp-lsh-nns.cpp +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "nns.h" -#include "std-random.h" -#include -#include -#include -#include -#include -#include - -using V = std::span; - -#define NUM_HASH_WORDS 4 -#define IGNORE_BITS 32 -#define HIST_SIZE (64*NUM_HASH_WORDS + 1) - -struct LsMaskHash { - uint64_t bits[NUM_HASH_WORDS]; - uint64_t mask[NUM_HASH_WORDS]; - LsMaskHash() { - memset(bits, 0xff, sizeof bits); - memset(mask, 0xff, sizeof mask); - } -}; - -static inline int hash_dist(const LsMaskHash &h1, const LsMaskHash &h2) { - int cnt = 0; - for (size_t o = 0; o < NUM_HASH_WORDS; ++o) { - uint64_t hx = h1.bits[o] ^ h2.bits[o]; - hx &= (h1.mask[o] | h2.mask[o]); - cnt += std::popcount(hx); - } - return cnt; -} - -struct Multiplier { - std::vector multiplier; - Multiplier(size_t dims) : multiplier(dims, 0.0) {} -}; - -LsMaskHash mask_hash_from_pv(V p, std::vector rpMatrix) { - LsMaskHash result; - float transformed[NUM_HASH_WORDS][64]; - std::vector squares; - for (size_t o = 0; o < NUM_HASH_WORDS; ++o) { - uint64_t hash = 0; - for (size_t bit = 0; bit < 64; ++bit) { - hash <<= 1u; - V m = rpMatrix[bit+64*o].multiplier; - double dotproduct = l2distCalc.product(m, p); - if (dotproduct > 0.0) { - hash |= 1u; - } - double sq = dotproduct * dotproduct; - transformed[o][bit] = sq; - squares.push_back(sq); - } - result.bits[o] = hash; - } - std::sort(squares.begin(), squares.end()); - double lim = squares[IGNORE_BITS*NUM_HASH_WORDS-1]; - for (size_t o = 0; o < NUM_HASH_WORDS; ++o) { - uint64_t mask = 0; - for (size_t bit = 0; bit < 64; ++bit) { - mask <<= 1u; - if (transformed[o][bit] > lim) { - mask |= 1u; - } - } - result.mask[o] = mask; - } - return result; -} - -class RpLshNns : public NNS -{ -private: - RndGen _rndGen; - std::vector _transformationMatrix; - std::vector _generated_doc_hashes; - -public: - RpLshNns(uint32_t numDims, const DocVectorAccess &dva) - : NNS(numDims, dva), _rndGen() - { - _transformationMatrix.reserve(NUM_HASH_WORDS*64); - for (size_t i = 0; i < NUM_HASH_WORDS*64; i++) { - _transformationMatrix.emplace_back(numDims); - Multiplier &mult = _transformationMatrix.back(); - for (float &v : mult.multiplier) { - v = _rndGen.nextNormal(); - } - } - fprintf(stderr, "ignore bits for lsh: %d*%d=%d\n", - IGNORE_BITS, NUM_HASH_WORDS, IGNORE_BITS*NUM_HASH_WORDS); - _generated_doc_hashes.reserve(100000); - } - - ~RpLshNns() { - } - - void addDoc(uint32_t docid) override { - V vector = _dva.get(docid); - LsMaskHash hash = mask_hash_from_pv(vector, _transformationMatrix); - if (_generated_doc_hashes.size() == docid) { - _generated_doc_hashes.push_back(hash); - return; - } - while (_generated_doc_hashes.size() <= docid) { - _generated_doc_hashes.push_back(LsMaskHash()); - } - _generated_doc_hashes[docid] = hash; - } - void removeDoc(uint32_t docid) override { - if (_generated_doc_hashes.size() > docid) { - _generated_doc_hashes[docid] = LsMaskHash(); - } - } - std::vector topK(uint32_t k, Vector vector, uint32_t search_k) override; - std::vector topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &bitvector) override; - - V getVector(uint32_t docid) const { return _dva.get(docid); } - double uniformRnd() { return _rndGen.nextUniform(); } - uint32_t dims() const { return _numDims; } -}; - - -struct LshHit { - double distance; - uint32_t docid; - int hash_distance; - LshHit() noexcept : distance(0.0), docid(0u), hash_distance(0) {} - LshHit(int id, double dist, int hd = 0) - : distance(dist), docid(id), hash_distance(hd) {} -}; - -struct LshHitComparator { - bool operator() (const LshHit &lhs, const LshHit& rhs) const { - if (lhs.distance < rhs.distance) return false; - if (lhs.distance > rhs.distance) return true; - return (lhs.docid > rhs.docid); - } -}; - -class LshHitHeap { -private: - size_t _size; - vespalib::PriorityQueue _priQ; - std::vector hd_histogram; -public: - explicit LshHitHeap(size_t maxSize) : _size(maxSize), _priQ() { - _priQ.reserve(maxSize); - } - ~LshHitHeap() {} - bool maybe_use(const LshHit &hit) { - if (_priQ.size() < _size) { - _priQ.push(hit); - uint32_t newHd = hit.hash_distance; - while (hd_histogram.size() <= newHd) { - hd_histogram.push_back(0); - } - hd_histogram[newHd]++; - } else if (hit.distance < _priQ.front().distance) { - uint32_t oldHd = _priQ.front().hash_distance; - uint32_t newHd = hit.hash_distance; - while (hd_histogram.size() <= newHd) { - hd_histogram.push_back(0); - } - hd_histogram[newHd]++; - hd_histogram[oldHd]--; - _priQ.front() = hit; - _priQ.adjust(); - return true; - } - return false; - } - int limitHashDistance() { - size_t sz = _priQ.size(); - uint32_t sum = 0; - for (uint32_t i = 0; i < hd_histogram.size(); ++i) { - sum += hd_histogram[i]; - if (sum >= ((3*sz)/4)) return i; - } - return 99999; - } - std::vector bestLshHits() { - std::vector result; - size_t sz = _priQ.size(); - result.resize(sz); - for (size_t i = sz; i-- > 0; ) { - result[i] = _priQ.front(); - _priQ.pop_front(); - } - return result; - } -}; - -std::vector -RpLshNns::topKfilter(uint32_t k, Vector vector, uint32_t search_k, const BitVector &skipDocIds) -{ - std::vector result; - result.reserve(k); - - std::vector tmp(_numDims); - std::span tmpArr(tmp); - - LsMaskHash query_hash = mask_hash_from_pv(vector, _transformationMatrix); - LshHitHeap heap(std::max(k, search_k)); - int limit_hash_dist = 99999; - size_t docidLimit = _generated_doc_hashes.size(); - for (uint32_t docid = 0; docid < docidLimit; ++docid) { - if (skipDocIds.isSet(docid)) continue; - int hd = hash_dist(query_hash, _generated_doc_hashes[docid]); - if (hd <= limit_hash_dist) { - double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr); - LshHit h(docid, dist, hd); - if (heap.maybe_use(h)) { - limit_hash_dist = heap.limitHashDistance(); - } - } - } - std::vector best = heap.bestLshHits(); - size_t numHits = std::min((size_t)k, best.size()); - for (size_t i = 0; i < numHits; ++i) { - result.emplace_back(best[i].docid, SqDist(best[i].distance)); - } - return result; -} - -std::vector -RpLshNns::topK(uint32_t k, Vector vector, uint32_t search_k) -{ - std::vector result; - result.reserve(k); - - std::vector tmp(_numDims); - std::span tmpArr(tmp); - - LsMaskHash query_hash = mask_hash_from_pv(vector, _transformationMatrix); - LshHitHeap heap(std::max(k, search_k)); - int limit_hash_dist = 99999; - int histogram[HIST_SIZE]; - memset(histogram, 0, sizeof histogram); - size_t docidLimit = _generated_doc_hashes.size(); - for (uint32_t docid = 0; docid < docidLimit; ++docid) { - int hd = hash_dist(query_hash, _generated_doc_hashes[docid]); - histogram[hd]++; - if (hd <= limit_hash_dist) { - double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr); - LshHit h(docid, dist, hd); - if (heap.maybe_use(h)) { - limit_hash_dist = heap.limitHashDistance(); - } - } - } - std::vector best = heap.bestLshHits(); - size_t numHits = std::min((size_t)k, best.size()); - for (size_t i = 0; i < numHits; ++i) { - result.emplace_back(best[i].docid, SqDist(best[i].distance)); - } - return result; -} - -std::unique_ptr> -make_rplsh_nns(uint32_t numDims, const DocVectorAccess &dva) -{ - return std::make_unique(numDims, dva); -}