From 6b9c1494e06c71a9fa31f3a8baad00c00b180660 Mon Sep 17 00:00:00 2001
From: Arne Juul <arnej@vespa.ai>
Date: Tue, 19 Nov 2024 12:30:16 +0000
Subject: [PATCH] add progress reporting when rebuilding HNSW index

---
 .../tensor/tensor_attribute_loader.cpp        | 54 ++++++++++++++++++-
 .../tensor/tensor_attribute_loader.h          |  4 +-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
index 4d862b9b76a4..70197fefffdd 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
@@ -13,10 +13,12 @@
 #include <vespa/searchlib/attribute/load_utils.h>
 #include <vespa/searchlib/attribute/readerbase.h>
 #include <vespa/searchlib/util/disk_space_calculator.h>
+#include <vespa/vespalib/objects/nbostream.h>
 #include <vespa/vespalib/util/arrayqueue.hpp>
 #include <vespa/vespalib/util/cpu_usage.h>
+#include <vespa/vespalib/util/jsonwriter.h>
 #include <vespa/vespalib/util/lambdatask.h>
-#include <vespa/vespalib/objects/nbostream.h>
+#include <vespa/vespalib/util/time.h>
 #include <condition_variable>
 #include <filesystem>
 #include <mutex>
@@ -41,12 +43,15 @@ bool
 can_use_index_save_file(const search::attribute::Config &config, const AttributeHeader& header)
 {
     if (!config.hnsw_index_params().has_value() || !header.get_hnsw_index_params().has_value()) {
+        LOG(warning, "Cannot use saved HNSW index for ANN, missing index parameters");
         return false;
     }
     const auto &config_params = config.hnsw_index_params().value();
     const auto &header_params = header.get_hnsw_index_params().value();
     if ((config_params.max_links_per_node() != header_params.max_links_per_node()) ||
-        (config_params.distance_metric() != header_params.distance_metric())) {
+        (config_params.distance_metric() != header_params.distance_metric()))
+    {
+        LOG(warning, "Cannot use saved HNSW index for ANN, index parameters have changed");
         return false;
     }
     return true;
@@ -258,16 +263,28 @@ TensorAttributeLoader::build_index(vespalib::Executor* executor, uint32_t docid_
     std::unique_ptr<IndexBuilder> builder;
     if (executor != nullptr) {
         builder = std::make_unique<ThreadedIndexBuilder>(_attr, _generation_handler, _store, *_index, *executor);
+        log_event("hnsw.index.rebuild.start", "execution", "multi-threaded");
     } else {
         builder = std::make_unique<ForegroundIndexBuilder>(_attr, *_index);
+        log_event("hnsw.index.rebuild.start", "execution", "single-threaded");
     }
+    constexpr vespalib::duration report_interval = 60s;
+    auto beforeStamp = vespalib::steady_clock::now();
+    auto last_report = beforeStamp;
     for (uint32_t lid = 0; lid < docid_limit; ++lid) {
         auto ref = _ref_vector[lid].load_relaxed();
         if (ref.valid()) {
             builder->add(lid);
+            auto now = vespalib::steady_clock::now();
+            if (last_report + report_interval < now) {
+                log_event("hnsw.index.rebuild.progress", "percent", (lid * 100.0 / docid_limit));
+                last_report = now;
+            }
         }
     }
     builder->wait_complete();
+    vespalib::duration elapsedTime = vespalib::steady_clock::now() - beforeStamp;
+    log_event("hnsw.index.rebuild.complete", "time.elapsed.ms", vespalib::count_ms(elapsedTime));
     _attr.commit();
 }
 
@@ -357,4 +374,37 @@ TensorAttributeLoader::check_consistency(uint32_t docid_limit)
         inconsistencies, _attr.getName().c_str(), elapsed);
 }
 
+
+namespace {
+struct EventValue {
+    vespalib::JSONStringer jstr;
+    EventValue(const TensorAttribute& attr) : jstr() {
+        jstr.beginObject();
+        jstr.appendKey("name").appendString(attr.getName());
+    }
+    void addKV(const char* key, const char* value) { jstr.appendKey(key).appendString(value); }
+    void addKV(const char* key, double value) { jstr.appendKey(key).appendDouble(value); }
+    const char* message() {
+        jstr.endObject();
+        return jstr.str().c_str();
+     }
+};
+} // namespace
+
+void TensorAttributeLoader::log_event(const char* eventName) {
+    EV_STATE(eventName, EventValue(_attr).message());
+}
+
+void TensorAttributeLoader::log_event(const char* eventName, const char* key, const char* value) {
+    EventValue ev(_attr);
+    ev.addKV(key, value);
+    EV_STATE(eventName, ev.message());
 }
+
+void TensorAttributeLoader::log_event(const char* eventName, const char* key, double value) {
+    EventValue ev(_attr);
+    ev.addKV(key, value);
+    EV_STATE(eventName, ev.message());
+}
+
+} // namespace search::tensor
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
index 89a07c03de5f..d65455600096 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
@@ -36,6 +36,9 @@ class TensorAttributeLoader {
     bool load_index();
     uint64_t get_index_size_on_disk();
     void check_consistency(uint32_t docid_limit);
+    void log_event(const char *eventName);
+    void log_event(const char *eventName, const char *key, const char *value);
+    void log_event(const char *eventName, const char *key, double value);
 
 public:
     TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index);
@@ -44,4 +47,3 @@ class TensorAttributeLoader {
 };
 
 }
-