From 6adc37239db9c66331220ff13d252bac935721e0 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Wed, 21 Aug 2024 12:51:45 +0000 Subject: [PATCH] - Add test for repopulation. - Drop incomplete multichunk support. --- .../bitvector/condensedbitvector_test.cpp | 88 +++++++++++++++++++ .../vespa/searchlib/common/bitvectorcache.cpp | 51 ++++------- .../vespa/searchlib/common/bitvectorcache.h | 7 +- .../searchlib/common/condensedbitvectors.h | 1 - .../searchlib/predicate/predicate_index.cpp | 2 +- .../searchlib/predicate/predicate_index.h | 2 +- .../queryeval/predicate_blueprint.cpp | 2 +- .../searchlib/queryeval/predicate_blueprint.h | 4 +- .../searchlib/queryeval/predicate_search.cpp | 4 +- .../searchlib/queryeval/predicate_search.h | 6 +- 10 files changed, 119 insertions(+), 48 deletions(-) diff --git a/searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp b/searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp index 4f7a0dfa736c..decbf9d708f0 100644 --- a/searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp +++ b/searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp @@ -1,11 +1,15 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include #include +#include +#include #include LOG_SETUP("condensedbitvector_test"); using search::CondensedBitVector; +using search::BitVectorCache; +using search::PopulateInterface; using vespalib::GenerationHolder; TEST("Verify state after init") @@ -45,4 +49,88 @@ TEST("Verify set/get") EXPECT_EQUAL(1u, sum); } +namespace { +using DocIds = std::vector; +using KeyDocIdsMap = vespalib::hash_map; +struct DocIdIterator : public PopulateInterface::Iterator { + explicit DocIdIterator(const DocIds & docs) : _docs(docs), _index(0) {} + int32_t getNext() override { + return (_index < _docs.size()) ? _docs[_index++] : -1; + } + + const DocIds & _docs; + uint32_t _index; +}; +class Populater : public PopulateInterface { +public: + explicit Populater(const KeyDocIdsMap & m) + : _map(m), + _empty() + {} + Iterator::UP lookup(uint64_t key) const override { + return _map.contains(key) + ? std::make_unique(_map[key]) + : std::make_unique(_empty); + } +private: + const KeyDocIdsMap & _map; + DocIds _empty; +}; + +KeyDocIdsMap +create(uint32_t numDocs, uint32_t numKeys, uint32_t seed) { + KeyDocIdsMap m; + std::srand(seed); + for (uint32_t k = 0; k < numKeys; k++) { + DocIds & docIds = m[k]; + for (uint32_t d=0, count=(std::rand()%numDocs); d < count; d++) { + docIds.push_back(std::rand()%numDocs); + } + } + return m; +} + +} + +TEST("Test repopulation of bitvector cache") { + GenerationHolder genHolder; + BitVectorCache cache(genHolder); + constexpr uint32_t numDocs = 100; + std::vector countVector(numDocs); + EXPECT_TRUE(cache.lookupCachedSet({{0,5}}).empty()); + cache.populate(numDocs, Populater(create(numDocs, 1, 1))); + EXPECT_TRUE(cache.lookupCachedSet({{0,5}}).empty()); + cache.requirePopulation(); + cache.populate(numDocs, Populater(create(numDocs, 1, 1))); + auto keySet = cache.lookupCachedSet({{0,5}, {1,10}}); + EXPECT_EQUAL(1u, keySet.size()); + EXPECT_TRUE(keySet.contains(0)); + cache.computeCountVector(keySet, countVector); + + std::vector> keys; + for (uint64_t i=0; i < 10; i++) { + keys.emplace_back(i, 10+i); + } + cache.lookupCachedSet(keys); + for (size_t i = 2; i < keys.size(); i++) { + cache.requirePopulation(); + cache.populate(numDocs, Populater(create(numDocs, i, i))); + keySet = cache.lookupCachedSet(keys); + EXPECT_EQUAL(keys.size(), keySet.size()); + cache.computeCountVector(keySet, countVector); + } + + keySet = cache.lookupCachedSet(keys); + cache.computeCountVector(keySet, countVector); + cache.set(1, 7, false); + cache.computeCountVector(keySet, countVector); + uint8_t prev_7 = countVector[7]; + cache.set(1, 7, true); + cache.computeCountVector(keySet, countVector); + EXPECT_EQUAL(prev_7 + 1, countVector[7]); + cache.set(1, 7, false); + cache.computeCountVector(keySet, countVector); + EXPECT_EQUAL(prev_7, countVector[7]); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/common/bitvectorcache.cpp b/searchlib/src/vespa/searchlib/common/bitvectorcache.cpp index e0403b54a838..8533db3272f5 100644 --- a/searchlib/src/vespa/searchlib/common/bitvectorcache.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvectorcache.cpp @@ -17,7 +17,7 @@ BitVectorCache::BitVectorCache(GenerationHolder &genHolder) _needPopulation(false), _mutex(), _keys(), - _chunks(), + _chunk(), _genHolder(genHolder) { } @@ -25,21 +25,20 @@ BitVectorCache::BitVectorCache(GenerationHolder &genHolder) BitVectorCache::~BitVectorCache() = default; void -BitVectorCache::computeCountVector(KeySet & keys, CountVector & v) const +BitVectorCache::computeCountVector(KeySet & keys, std::span v) const { std::vector notFound; - std::vector keySets; - ChunkV chunks; + CondensedBitVector::KeySet keySet; + CondensedBitVector::SP chunk; { std::shared_lock guard(_mutex); - keySets.resize(_chunks.size()); auto end = _keys.end(); for (Key k : keys) { auto found = _keys.find(k); if (found != end) { const KeyMeta & m = found->second; if (m.isCached()) { - keySets[m.chunkId()].insert(m.chunkIndex()); + keySet.insert(m.chunkIndex()); } else { notFound.push_back(k); } @@ -47,21 +46,15 @@ BitVectorCache::computeCountVector(KeySet & keys, CountVector & v) const notFound.push_back(k); } } - chunks = _chunks; + chunk = _chunk; } for (Key k : notFound) { keys.erase(k); } - size_t index(0); - if (chunks.empty()) { + if (!chunk) { memset(&v[0], 0, v.size()); - } - for (const auto & chunk : chunks) { - if (index == 0) { - chunk->initializeCountVector(keySets[index++], v); - } else { - chunk->addCountVector(keySets[index++], v); - } + } else { + chunk->initializeCountVector(keySet, v); } } @@ -117,7 +110,7 @@ bool BitVectorCache::hasCostChanged(const std::shared_lock & guard) { (void) guard; - if ( ! _chunks.empty()) { + if (_chunk) { SortedKeyMeta sorted(getSorted(_keys)); double oldCached(0); for (auto & e : sorted) { @@ -127,7 +120,7 @@ BitVectorCache::hasCostChanged(const std::shared_lock & guard } } double newCached(0); - for (size_t i(0); i < sorted.size() && i < _chunks[0]->getKeyCapacity(); i++) { + for (size_t i(0); i < sorted.size() && i < _chunk->getKeyCapacity(); i++) { const KeyMeta & m = *sorted[i].second; newCached += m.cost(); } @@ -182,15 +175,15 @@ void BitVectorCache::populate(uint32_t sz, const PopulateInterface & lookup) { if (!needPopulation()) return; + CondensedBitVector::UP chunk(CondensedBitVector::create(sz, _genHolder)); std::unique_lock guard(_mutex); Key2Index newKeys(_keys); guard.unlock(); - CondensedBitVector::UP chunk(CondensedBitVector::create(sz, _genHolder)); populate(newKeys, *chunk, lookup); guard.lock(); - _chunks.push_back(std::move(chunk)); + _chunk = std::move(chunk); _keys.swap(newKeys); _needPopulation = false; } @@ -203,24 +196,17 @@ BitVectorCache::set(Key key, uint32_t index, bool v) if (found != _keys.end()) { const KeyMeta & m(found->second); if (m.isCached()) { - _chunks[m.chunkId()]->set(m.chunkIndex(), index, v); + _chunk->set(m.chunkIndex(), index, v); } } } -bool -BitVectorCache::get(Key key, uint32_t index) const -{ - (void) key; (void) index; - return false; -} - void BitVectorCache::removeIndex(uint32_t index) { std::unique_lock guard(_mutex); - for (auto & chunk : _chunks) { - chunk->clearIndex(index); + if (_chunk) { + _chunk->clearIndex(index); } } @@ -228,8 +214,9 @@ BitVectorCache::removeIndex(uint32_t index) void BitVectorCache::adjustDocIdLimit(uint32_t docId) { - for (auto &chunk : _chunks) { - chunk->adjustDocIdLimit(docId); + std::unique_lock guard(_mutex); + if (_chunk) { + _chunk->adjustDocIdLimit(docId); } } diff --git a/searchlib/src/vespa/searchlib/common/bitvectorcache.h b/searchlib/src/vespa/searchlib/common/bitvectorcache.h index cd428bc8f660..bd2c72cfcd23 100644 --- a/searchlib/src/vespa/searchlib/common/bitvectorcache.h +++ b/searchlib/src/vespa/searchlib/common/bitvectorcache.h @@ -27,15 +27,13 @@ class BitVectorCache using Key = uint64_t; using KeySet = vespalib::hash_set; using KeyAndCountSet = std::vector>; - using CountVector = CondensedBitVector::CountVector; using GenerationHolder = vespalib::GenerationHolder; explicit BitVectorCache(GenerationHolder &genHolder); ~BitVectorCache(); - void computeCountVector(KeySet & keys, CountVector & v) const; + void computeCountVector(KeySet & keys, std::span v) const; KeySet lookupCachedSet(const KeyAndCountSet & keys); void set(Key key, uint32_t index, bool v); - bool get(Key key, uint32_t index) const; void removeIndex(uint32_t index); void adjustDocIdLimit(uint32_t docId); void populate(uint32_t count, const PopulateInterface &); @@ -67,7 +65,6 @@ class BitVectorCache bool isCached() const { return _chunkId >= 0; } size_t bitCount() const { return _bitCount; } size_t chunkIndex() const { return _chunkIndex; } - size_t chunkId() const { return _chunkId; } size_t lookupCount() const { return _lookupCount.load(std::memory_order_relaxed); } KeyMeta & lookup() { _lookupCount++; return *this; } KeyMeta & bitCount(uint32_t v) { _bitCount = v; return *this; } @@ -92,7 +89,7 @@ class BitVectorCache std::atomic _needPopulation; mutable std::shared_mutex _mutex; Key2Index _keys; - ChunkV _chunks; + CondensedBitVector::SP _chunk; GenerationHolder &_genHolder; }; diff --git a/searchlib/src/vespa/searchlib/common/condensedbitvectors.h b/searchlib/src/vespa/searchlib/common/condensedbitvectors.h index 2d914163b7f3..6f182220906e 100644 --- a/searchlib/src/vespa/searchlib/common/condensedbitvectors.h +++ b/searchlib/src/vespa/searchlib/common/condensedbitvectors.h @@ -14,7 +14,6 @@ class CondensedBitVector using SP = std::shared_ptr; using Key = uint32_t; using KeySet = std::set; - using CountVector = std::span; virtual ~CondensedBitVector(); diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp index dd9255f0b702..62c6121d85f0 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp @@ -283,7 +283,7 @@ PredicateIndex::lookupCachedSet(const BitVectorCache::KeyAndCountSet & keys) con } void -PredicateIndex::computeCountVector(BitVectorCache::KeySet & keys, BitVectorCache::CountVector & v) const +PredicateIndex::computeCountVector(BitVectorCache::KeySet & keys, std::span v) const { _cache.computeCountVector(keys, v); } diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.h b/searchlib/src/vespa/searchlib/predicate/predicate_index.h index f74c652a461e..f588ebf6f6e0 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.h +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.h @@ -98,7 +98,7 @@ class PredicateIndex : public PopulateInterface { void populateIfNeeded(size_t doc_id_limit); BitVectorCache::KeySet lookupCachedSet(const BitVectorCache::KeyAndCountSet & keys) const; - void computeCountVector(BitVectorCache::KeySet & keys, BitVectorCache::CountVector & v) const; + void computeCountVector(BitVectorCache::KeySet & keys, std::span v) const; /* * Adjust size of structures to have space for docId. diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.cpp index d17f9957b386..33a2d875b4bf 100644 --- a/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.cpp @@ -265,7 +265,7 @@ PredicateBlueprint::fetchPostings(const ExecuteInfo &) { PredicateAttribute::MinFeatureHandle mfh = predicate_attribute().getMinFeatureVector(); Alloc kv(Alloc::alloc(mfh.second, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE*4)); _kVBacking.swap(kv); - _kV = BitVectorCache::CountVector(static_cast(_kVBacking.get()), mfh.second); + _kV = std::span(static_cast(_kVBacking.get()), mfh.second); _index.computeCountVector(_cachedFeatures, _kV); for (const auto & entry : _bounds_dict_entries) { addBoundsPostingToK(entry.feature); diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h index 977dd4ddb4f4..0d61177c67fe 100644 --- a/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h @@ -62,7 +62,7 @@ class PredicateBlueprint : public ComplexLeafBlueprint { } // Exposed for testing - const BitVectorCache::CountVector & getKV() const { return _kV; } + std::span getKV() const { return _kV; } const BitVectorCache::KeySet & getCachedFeatures() const { return _cachedFeatures; } private: using BTreeIterator = predicate::SimpleIndex::BTreeIterator; @@ -85,7 +85,7 @@ class PredicateBlueprint : public ComplexLeafBlueprint { const PredicateAttribute & _attribute; const predicate::PredicateIndex &_index; Alloc _kVBacking; - BitVectorCache::CountVector _kV; + std::span _kV; BitVectorCache::KeySet _cachedFeatures; std::vector _interval_dict_entries; diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp b/searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp index f42414ef4fb9..3b06eaad43ca 100644 --- a/searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp @@ -132,10 +132,10 @@ SkipMinFeature::create(const uint8_t * min_feature, const uint8_t * kv, size_t s PredicateSearch::PredicateSearch(const uint8_t * minFeatureVector, const IntervalRange * interval_range_vector, IntervalRange max_interval_range, - CondensedBitVector::CountVector kV, + std::span kV, vector posting_lists, const fef::TermFieldMatchDataArray &tfmda) - : _skip(SkipMinFeature::create(minFeatureVector, &kV[0], kV.size())), + : _skip(SkipMinFeature::create(minFeatureVector, kV.data(), kV.size())), _posting_lists(std::move(posting_lists)), _sorted_indexes(_posting_lists.size()), _sorted_indexes_merge_buffer(_posting_lists.size()), diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_search.h b/searchlib/src/vespa/searchlib/queryeval/predicate_search.h index 6672b297b22e..9607a99e7492 100644 --- a/searchlib/src/vespa/searchlib/queryeval/predicate_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/predicate_search.h @@ -19,7 +19,7 @@ class SkipMinFeature { public: using UP = std::unique_ptr; - virtual ~SkipMinFeature() { } + virtual ~SkipMinFeature() = default; VESPA_DLL_LOCAL virtual uint32_t next() = 0; static SkipMinFeature::UP create(const uint8_t * min_feature, const uint8_t * kv, size_t sz); }; @@ -54,10 +54,10 @@ class PredicateSearch : public SearchIterator { PredicateSearch(const uint8_t * minFeature, const IntervalRange * interval_range_vector, IntervalRange max_interval_range, - CondensedBitVector::CountVector kV, + std::span kV, std::vector posting_lists, const fef::TermFieldMatchDataArray &tfmda); - ~PredicateSearch(); + ~PredicateSearch() override; void doSeek(uint32_t doc_id) override; void doUnpack(uint32_t doc_id) override;