Merge pull request #29915 from vespa-engine/havardpe/strict-flow

take strictness into account for flow/cost/sorting
vespa-engine · Jan 15, 2024 · 7ff3cdf · 7ff3cdf
2 parents ccd3109 + 7a8871c
commit 7ff3cdf
Show file tree

Hide file tree

Showing 6 changed files with 445 additions and 118 deletions.
diff --git a/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp b/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp
@@ -24,10 +24,10 @@ class MyOr : public IntermediateBlueprint
 private:
 public:
     double calculate_cost() const final {
-        return cost_of(get_children(), OrFlow());
+        return OrFlow::cost_of(get_children());
     }
     double calculate_relative_estimate() const final {
-        return estimate_of(get_children(), OrFlow());
+        return OrFlow::estimate_of(get_children());
     }
     HitEstimate combine(const std::vector<HitEstimate> &data) const override {
         return max(data);

diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp
@@ -1380,7 +1380,7 @@ TEST("cost for ONEAR") {
 }
 
 TEST("cost for WEAKAND") {
-    verify_cost(make::WEAKAND(1000), calc_cost({{1.1, 0.8},{1.2, 0.7},{1.3, 0.5}}));
+    verify_cost(make::WEAKAND(1000), calc_cost({{1.3, 0.5},{1.2, 0.7},{1.1, 0.8}}));
 }
 
 TEST_MAIN() { TEST_DEBUG("lhs.out", "rhs.out"); TEST_RUN_ALL(); }
diff --git a/searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp b/searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp
@@ -5,44 +5,46 @@
 #include <vector>
 #include <random>
 
-using search::queryeval::AndFlow;
-using search::queryeval::OrFlow;
+constexpr size_t loop_cnt = 64;
+
+using namespace search::queryeval;
+
+struct ItemAdapter {
+    double estimate(const auto &child) const noexcept { return child.rel_est; }
+    double cost(const auto &child) const noexcept { return child.cost; }
+    double strict_cost(const auto &child) const noexcept { return child.strict_cost; }
+};
 
 struct Item {
     double rel_est;
     double cost;
-    Item(double rel_est_in, double cost_in) noexcept
-      : rel_est(rel_est_in), cost(cost_in) {}
-    static void sort_for_and(std::vector<Item> &data) {
-        std::sort(data.begin(), data.end(), [](const Item &a, const Item &b) noexcept {
-                                                return (1.0 - a.rel_est) / a.cost > (1.0 - b.rel_est) / b.cost;
-                                            });
+    double strict_cost;
+    Item(double rel_est_in, double cost_in, double strict_cost_in) noexcept
+      : rel_est(rel_est_in), cost(cost_in), strict_cost(strict_cost_in) {}
+    template <typename FLOW> static double estimate_of(std::vector<Item> &data) {
+        return FLOW::estimate_of(ItemAdapter(), data);
     }
-    static void sort_for_or(std::vector<Item> &data) {
-        std::sort(data.begin(), data.end(), [](const Item &a, const Item &b) noexcept {
-                                                return a.rel_est / a.cost > b.rel_est / b.cost;
-                                            });
+    template <typename FLOW> static void sort(std::vector<Item> &data, bool strict) {
+        FLOW::sort(ItemAdapter(), data, strict);
     }
-    static double cost_of(const std::vector<Item> &data, auto flow) {
-        double cost = 0.0;
-        for (const Item &item: data) {
-            cost += flow.flow() * item.cost;
-            flow.add(item.rel_est);
-        }
-        return cost;
+    template <typename FLOW> static double cost_of(const std::vector<Item> &data, bool strict) {
+        return FLOW::cost_of(ItemAdapter(), data, strict);
+    }
+    template <typename FLOW> static double ordered_cost_of(const std::vector<Item> &data, bool strict) {
+        return flow::ordered_cost_of(ItemAdapter(), data, FLOW(1.0, strict));
     }
-    static double cost_of_and(const std::vector<Item> &data) { return cost_of(data, AndFlow()); }
-    static double cost_of_or(const std::vector<Item> &data) { return cost_of(data, OrFlow()); }
+    auto operator <=>(const Item &rhs) const noexcept = default;
 };
 
 std::vector<Item> gen_data(size_t size) {
     static std::mt19937 gen;
-    static std::uniform_real_distribution<double> rel_est(0.1, 0.9);
-    static std::uniform_real_distribution<double> cost(1.0, 10.0);
+    static std::uniform_real_distribution<double>     rel_est(0.1,  0.9);
+    static std::uniform_real_distribution<double>        cost(1.0, 10.0);
+    static std::uniform_real_distribution<double> strict_cost(0.1,  5.0);
     std::vector<Item> result;
     result.reserve(size);
     for (size_t i = 0; i < size; ++i) {
-        result.emplace_back(rel_est(gen), cost(gen));
+        result.emplace_back(rel_est(gen), cost(gen), strict_cost(gen));
     }
     return result;
 }
@@ -80,37 +82,191 @@ TEST(FlowTest, perm_test) {
     EXPECT_EQ(seen.size(), 120);
 }
 
+template <template <typename> typename ORDER>
+void verify_ordering_is_strict_weak() {
+    auto cmp = ORDER(ItemAdapter());
+    auto input = gen_data(7);
+    input.emplace_back(0.5, 1.5, 0.5);
+    input.emplace_back(0.5, 1.5, 0.5);
+    input.emplace_back(0.5, 1.5, 0.5);
+    input.emplace_back(0.0, 1.5, 0.5);
+    input.emplace_back(0.0, 1.5, 0.5);
+    input.emplace_back(0.5, 0.0, 0.5);
+    input.emplace_back(0.5, 0.0, 0.5);
+    input.emplace_back(0.5, 1.5, 0.0);
+    input.emplace_back(0.5, 1.5, 0.0);
+    input.emplace_back(0.0, 0.0, 0.0);
+    input.emplace_back(0.0, 0.0, 0.0);
+    std::vector<Item> output;
+    for (const Item &in: input) {
+        EXPECT_FALSE(cmp(in, in)); // Irreflexivity
+        size_t out_idx = 0;
+        bool lower = false;
+        bool upper = false;
+        for (const Item &out: output) {
+            if (cmp(out, in)) {
+                EXPECT_FALSE(cmp(in, out)); // Antisymmetry
+                EXPECT_FALSE(lower); // Transitivity
+                EXPECT_FALSE(upper); // Transitivity
+                ++out_idx;
+            } else {
+                lower = true;
+                if (cmp(in, out)) {
+                    upper = true;
+                } else {
+                    EXPECT_FALSE(upper); // Transitivity
+                }
+            }
+        }
+        output.insert(output.begin() + out_idx, in);
+    }
+}
+
+TEST(FlowTest, and_ordering_is_strict_weak) {
+    verify_ordering_is_strict_weak<flow::MinAndCost>();
+}
+
+TEST(FlowTest, or_ordering_is_strict_weak) {
+    verify_ordering_is_strict_weak<flow::MinOrCost>();
+}
+
+TEST(FlowTest, strict_or_ordering_is_strict_weak) {
+    verify_ordering_is_strict_weak<flow::MinOrStrictCost>();
+}
+
+struct ExpectFlow {
+    double flow;
+    double est;
+    bool strict;
+};
+
+void verify_flow(auto flow, const std::vector<double> &est_list, const std::vector<ExpectFlow> &expect) {
+    ASSERT_EQ(est_list.size() + 1, expect.size());
+    for (size_t i = 0; i < expect.size(); ++i) {
+        EXPECT_DOUBLE_EQ(flow.flow(), expect[i].flow);
+        EXPECT_DOUBLE_EQ(flow.estimate(), expect[i].est);
+        EXPECT_EQ(flow.strict(), expect[i].strict);
+        if (i < est_list.size()) {
+            flow.add(est_list[i]);
+        }
+    }
+}
+
+TEST(FlowTest, basic_and_flow) {
+    for (double in: {1.0, 0.5, 0.25}) {
+        for (bool strict: {false, true}) {
+            verify_flow(AndFlow(in, strict), {0.4, 0.7, 0.2},
+                        {{in, 0.0, strict},
+                         {in*0.4, in*0.4, false},
+                         {in*0.4*0.7, in*0.4*0.7, false},
+                         {in*0.4*0.7*0.2, in*0.4*0.7*0.2, false}});
+        }
+    }
+}
+
+TEST(FlowTest, basic_or_flow) {
+    for (double in: {1.0, 0.5, 0.25}) {
+        for (bool strict: {false, true}) {
+            verify_flow(OrFlow(in, strict), {0.4, 0.7, 0.2},
+                        {{in, 0.0, strict},
+                         {in*0.6, 1.0-in*0.6, strict},
+                         {in*0.6*0.3, 1.0-in*0.6*0.3, strict},
+                         {in*0.6*0.3*0.8, 1.0-in*0.6*0.3*0.8, strict}});
+        }
+    }
+}
+
+TEST(FlowTest, basic_and_not_flow) {
+    for (double in: {1.0, 0.5, 0.25}) {
+        for (bool strict: {false, true}) {
+            verify_flow(AndNotFlow(in, strict), {0.4, 0.7, 0.2},
+                        {{in, 0.0, strict},
+                         {in*0.4, in*0.4, false},
+                         {in*0.4*0.3, in*0.4*0.3, false},
+                         {in*0.4*0.3*0.8, in*0.4*0.3*0.8, false}});
+        }
+    }
+}
+
+TEST(FlowTest, flow_cost) {
+    std::vector<Item> data = {{0.4, 1.1, 0.6}, {0.7, 1.2, 0.5}, {0.2, 1.3, 0.4}};
+    EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndFlow>(data, false), 1.1 + 0.4*1.2 + 0.4*0.7*1.3);
+    EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndFlow>(data, true), 0.6 + 0.4*1.2 + 0.4*0.7*1.3);
+    EXPECT_DOUBLE_EQ(Item::ordered_cost_of<OrFlow>(data, false), 1.1 + 0.6*1.2 + 0.6*0.3*1.3);
+    EXPECT_DOUBLE_EQ(Item::ordered_cost_of<OrFlow>(data, true), 0.6 + 0.6*0.5 + 0.6*0.3*0.4);
+    EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndNotFlow>(data, false), 1.1 + 0.4*1.2 + 0.4*0.3*1.3);
+    EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndNotFlow>(data, true), 0.6 + 0.4*1.2 + 0.4*0.3*1.3);
+}
+
 TEST(FlowTest, optimal_and_flow) {
-    for (size_t i = 0; i < 256; ++i) {
-        auto data = gen_data(7);
-        Item::sort_for_and(data);
-        double min_cost = Item::cost_of_and(data);
-        double max_cost = 0.0;
-        auto check = [min_cost,&max_cost](const std::vector<Item> &my_data) noexcept {
-                         double my_cost = Item::cost_of_and(my_data);
-                         EXPECT_LE(min_cost, my_cost);
-                         max_cost = std::max(max_cost, my_cost);
-                     };
-        each_perm(data, check);
-        fprintf(stderr, "  and cost(%zu): min: %g, max: %g, factor: %g\n",
-                i, min_cost, max_cost, max_cost / min_cost);
+    for (size_t i = 0; i < loop_cnt; ++i) {
+        for (bool strict: {false, true}) {
+            auto data = gen_data(7);
+            double ref_est = Item::estimate_of<AndFlow>(data);
+            double min_cost = Item::cost_of<AndFlow>(data, strict);
+            double max_cost = 0.0;
+            Item::sort<AndFlow>(data, strict);
+            EXPECT_EQ(Item::ordered_cost_of<AndFlow>(data, strict), min_cost);
+            auto check = [&](const std::vector<Item> &my_data) noexcept {
+                             double my_cost = Item::ordered_cost_of<AndFlow>(my_data, strict);
+                             EXPECT_LE(min_cost, my_cost);
+                             max_cost = std::max(max_cost, my_cost);
+                         };
+            each_perm(data, check);
+            if (loop_cnt < 1024 || i % 1024 == 0) {
+                fprintf(stderr, "  AND cost(%zu,%s): min: %g, max: %g, factor: %g\n",
+                        i, strict ? "strict" : "non-strict", min_cost, max_cost, max_cost / min_cost);
+            }
+            EXPECT_NEAR(ref_est, Item::estimate_of<AndFlow>(data), 1e-9);
+        }
     }
 }
 
 TEST(FlowTest, optimal_or_flow) {
-    for (size_t i = 0; i < 256; ++i) {
-        auto data = gen_data(7);
-        Item::sort_for_or(data);
-        double min_cost = Item::cost_of_or(data);
-        double max_cost = 0.0;
-        auto check = [min_cost,&max_cost](const std::vector<Item> &my_data) noexcept {
-                         double my_cost = Item::cost_of_or(my_data);
-                         EXPECT_LE(min_cost, my_cost);
-                         max_cost = std::max(max_cost, my_cost);
-                     };
-        each_perm(data, check);
-        fprintf(stderr, "  or cost(%zu): min: %g, max: %g, factor: %g\n",
-                i, min_cost, max_cost, max_cost / min_cost);
+    for (size_t i = 0; i < loop_cnt; ++i) {
+        for (bool strict: {false, true}) {
+            auto data = gen_data(7);
+            double min_cost = Item::cost_of<OrFlow>(data, strict);
+            double max_cost = 0.0;
+            Item::sort<OrFlow>(data, strict);
+            EXPECT_EQ(Item::ordered_cost_of<OrFlow>(data, strict), min_cost);
+            auto check = [&](const std::vector<Item> &my_data) noexcept {
+                             double my_cost = Item::ordered_cost_of<OrFlow>(my_data, strict);
+                             EXPECT_LE(min_cost, my_cost);
+                             max_cost = std::max(max_cost, my_cost);
+                         };
+            each_perm(data, check);
+            if (loop_cnt < 1024 || i % 1024 == 0) {
+                fprintf(stderr, "  OR cost(%zu,%s): min: %g, max: %g, factor: %g\n",
+                        i, strict ? "strict" : "non-strict", min_cost, max_cost, max_cost / min_cost);
+            }
+        }
+    }
+}
+
+TEST(FlowTest, optimal_and_not_flow) {
+    for (size_t i = 0; i < loop_cnt; ++i) {
+        for (bool strict: {false, true}) {
+            auto data = gen_data(7);
+            Item first = data[0];
+            double min_cost = Item::cost_of<AndNotFlow>(data, strict);
+            double max_cost = 0.0;
+            Item::sort<AndNotFlow>(data, strict);
+            EXPECT_EQ(data[0], first);
+            EXPECT_EQ(Item::ordered_cost_of<AndNotFlow>(data, strict), min_cost);
+            auto check = [&](const std::vector<Item> &my_data) noexcept {
+                             if (my_data[0] == first) {
+                                 double my_cost = Item::ordered_cost_of<AndNotFlow>(my_data, strict);
+                                 EXPECT_LE(min_cost, my_cost);
+                                 max_cost = std::max(max_cost, my_cost);
+                             }
+                         };
+            each_perm(data, check);
+            if (loop_cnt < 1024 || i % 1024 == 0) {
+                fprintf(stderr, "  ANDNOT cost(%zu,%s): min: %g, max: %g, factor: %g\n",
+                        i, strict ? "strict" : "non-strict", min_cost, max_cost, max_cost / min_cost);
+            }
+        }
     }
 }
 

diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.h b/searchlib/src/vespa/searchlib/queryeval/blueprint.h
@@ -144,22 +144,6 @@ class Blueprint
         return (total_docs == 0) ? 0.0 : double(est) / double(total_docs);
     }
 
-    static double cost_of(const Children &children, auto flow) {
-        double cost = 0.0;
-        for (const auto &child: children) {
-            cost += flow.flow() * child->cost();
-            flow.add(child->estimate());
-        }
-        return cost;
-    }
-
-    static double estimate_of(const Children &children, auto flow) {
-        for (const auto &child: children) {
-            flow.add(child->estimate());
-        }
-        return flow.estimate();
-    }
-
     // utility that just takes maximum estimate
     static HitEstimate max(const std::vector<HitEstimate> &data);
 
@@ -172,20 +156,6 @@ class Blueprint
     // lower limit for docid_limit: max child estimate
     static HitEstimate sat_sum(const std::vector<HitEstimate> &data, uint32_t docid_limit);
 
-    // sort children to minimize total cost of OR flow
-    struct MinimalOrCost {
-        bool operator () (const auto &a, const auto &b) const noexcept {
-            return a->estimate() / a->cost() > b->estimate() / b->cost();
-        }
-    };
-
-    // sort children to minimize total cost of AND flow
-    struct MinimalAndCost {
-        bool operator () (const auto &a, const auto &b) const noexcept {
-            return (1.0 - a->estimate()) / a->cost() > (1.0 - b->estimate()) / b->cost();
-        }
-    };
-
     // utility to get the greater estimate to sort first, higher tiers last
     struct TieredGreaterEstimate {
         bool operator () (const auto &a, const auto &b) const noexcept {