Skip to content

Commit

Permalink
Merge pull request #29915 from vespa-engine/havardpe/strict-flow
Browse files Browse the repository at this point in the history
take strictness into account for flow/cost/sorting
  • Loading branch information
baldersheim authored Jan 15, 2024
2 parents ccd3109 + 7a8871c commit 7ff3cdf
Show file tree
Hide file tree
Showing 6 changed files with 445 additions and 118 deletions.
4 changes: 2 additions & 2 deletions searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ class MyOr : public IntermediateBlueprint
private:
public:
double calculate_cost() const final {
return cost_of(get_children(), OrFlow());
return OrFlow::cost_of(get_children());
}
double calculate_relative_estimate() const final {
return estimate_of(get_children(), OrFlow());
return OrFlow::estimate_of(get_children());
}
HitEstimate combine(const std::vector<HitEstimate> &data) const override {
return max(data);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1380,7 +1380,7 @@ TEST("cost for ONEAR") {
}

TEST("cost for WEAKAND") {
verify_cost(make::WEAKAND(1000), calc_cost({{1.1, 0.8},{1.2, 0.7},{1.3, 0.5}}));
verify_cost(make::WEAKAND(1000), calc_cost({{1.3, 0.5},{1.2, 0.7},{1.1, 0.8}}));
}

TEST_MAIN() { TEST_DEBUG("lhs.out", "rhs.out"); TEST_RUN_ALL(); }
256 changes: 206 additions & 50 deletions searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,44 +5,46 @@
#include <vector>
#include <random>

using search::queryeval::AndFlow;
using search::queryeval::OrFlow;
constexpr size_t loop_cnt = 64;

using namespace search::queryeval;

struct ItemAdapter {
double estimate(const auto &child) const noexcept { return child.rel_est; }
double cost(const auto &child) const noexcept { return child.cost; }
double strict_cost(const auto &child) const noexcept { return child.strict_cost; }
};

struct Item {
double rel_est;
double cost;
Item(double rel_est_in, double cost_in) noexcept
: rel_est(rel_est_in), cost(cost_in) {}
static void sort_for_and(std::vector<Item> &data) {
std::sort(data.begin(), data.end(), [](const Item &a, const Item &b) noexcept {
return (1.0 - a.rel_est) / a.cost > (1.0 - b.rel_est) / b.cost;
});
double strict_cost;
Item(double rel_est_in, double cost_in, double strict_cost_in) noexcept
: rel_est(rel_est_in), cost(cost_in), strict_cost(strict_cost_in) {}
template <typename FLOW> static double estimate_of(std::vector<Item> &data) {
return FLOW::estimate_of(ItemAdapter(), data);
}
static void sort_for_or(std::vector<Item> &data) {
std::sort(data.begin(), data.end(), [](const Item &a, const Item &b) noexcept {
return a.rel_est / a.cost > b.rel_est / b.cost;
});
template <typename FLOW> static void sort(std::vector<Item> &data, bool strict) {
FLOW::sort(ItemAdapter(), data, strict);
}
static double cost_of(const std::vector<Item> &data, auto flow) {
double cost = 0.0;
for (const Item &item: data) {
cost += flow.flow() * item.cost;
flow.add(item.rel_est);
}
return cost;
template <typename FLOW> static double cost_of(const std::vector<Item> &data, bool strict) {
return FLOW::cost_of(ItemAdapter(), data, strict);
}
template <typename FLOW> static double ordered_cost_of(const std::vector<Item> &data, bool strict) {
return flow::ordered_cost_of(ItemAdapter(), data, FLOW(1.0, strict));
}
static double cost_of_and(const std::vector<Item> &data) { return cost_of(data, AndFlow()); }
static double cost_of_or(const std::vector<Item> &data) { return cost_of(data, OrFlow()); }
auto operator <=>(const Item &rhs) const noexcept = default;
};

std::vector<Item> gen_data(size_t size) {
static std::mt19937 gen;
static std::uniform_real_distribution<double> rel_est(0.1, 0.9);
static std::uniform_real_distribution<double> cost(1.0, 10.0);
static std::uniform_real_distribution<double> rel_est(0.1, 0.9);
static std::uniform_real_distribution<double> cost(1.0, 10.0);
static std::uniform_real_distribution<double> strict_cost(0.1, 5.0);
std::vector<Item> result;
result.reserve(size);
for (size_t i = 0; i < size; ++i) {
result.emplace_back(rel_est(gen), cost(gen));
result.emplace_back(rel_est(gen), cost(gen), strict_cost(gen));
}
return result;
}
Expand Down Expand Up @@ -80,37 +82,191 @@ TEST(FlowTest, perm_test) {
EXPECT_EQ(seen.size(), 120);
}

template <template <typename> typename ORDER>
void verify_ordering_is_strict_weak() {
auto cmp = ORDER(ItemAdapter());
auto input = gen_data(7);
input.emplace_back(0.5, 1.5, 0.5);
input.emplace_back(0.5, 1.5, 0.5);
input.emplace_back(0.5, 1.5, 0.5);
input.emplace_back(0.0, 1.5, 0.5);
input.emplace_back(0.0, 1.5, 0.5);
input.emplace_back(0.5, 0.0, 0.5);
input.emplace_back(0.5, 0.0, 0.5);
input.emplace_back(0.5, 1.5, 0.0);
input.emplace_back(0.5, 1.5, 0.0);
input.emplace_back(0.0, 0.0, 0.0);
input.emplace_back(0.0, 0.0, 0.0);
std::vector<Item> output;
for (const Item &in: input) {
EXPECT_FALSE(cmp(in, in)); // Irreflexivity
size_t out_idx = 0;
bool lower = false;
bool upper = false;
for (const Item &out: output) {
if (cmp(out, in)) {
EXPECT_FALSE(cmp(in, out)); // Antisymmetry
EXPECT_FALSE(lower); // Transitivity
EXPECT_FALSE(upper); // Transitivity
++out_idx;
} else {
lower = true;
if (cmp(in, out)) {
upper = true;
} else {
EXPECT_FALSE(upper); // Transitivity
}
}
}
output.insert(output.begin() + out_idx, in);
}
}

TEST(FlowTest, and_ordering_is_strict_weak) {
verify_ordering_is_strict_weak<flow::MinAndCost>();
}

TEST(FlowTest, or_ordering_is_strict_weak) {
verify_ordering_is_strict_weak<flow::MinOrCost>();
}

TEST(FlowTest, strict_or_ordering_is_strict_weak) {
verify_ordering_is_strict_weak<flow::MinOrStrictCost>();
}

struct ExpectFlow {
double flow;
double est;
bool strict;
};

void verify_flow(auto flow, const std::vector<double> &est_list, const std::vector<ExpectFlow> &expect) {
ASSERT_EQ(est_list.size() + 1, expect.size());
for (size_t i = 0; i < expect.size(); ++i) {
EXPECT_DOUBLE_EQ(flow.flow(), expect[i].flow);
EXPECT_DOUBLE_EQ(flow.estimate(), expect[i].est);
EXPECT_EQ(flow.strict(), expect[i].strict);
if (i < est_list.size()) {
flow.add(est_list[i]);
}
}
}

TEST(FlowTest, basic_and_flow) {
for (double in: {1.0, 0.5, 0.25}) {
for (bool strict: {false, true}) {
verify_flow(AndFlow(in, strict), {0.4, 0.7, 0.2},
{{in, 0.0, strict},
{in*0.4, in*0.4, false},
{in*0.4*0.7, in*0.4*0.7, false},
{in*0.4*0.7*0.2, in*0.4*0.7*0.2, false}});
}
}
}

TEST(FlowTest, basic_or_flow) {
for (double in: {1.0, 0.5, 0.25}) {
for (bool strict: {false, true}) {
verify_flow(OrFlow(in, strict), {0.4, 0.7, 0.2},
{{in, 0.0, strict},
{in*0.6, 1.0-in*0.6, strict},
{in*0.6*0.3, 1.0-in*0.6*0.3, strict},
{in*0.6*0.3*0.8, 1.0-in*0.6*0.3*0.8, strict}});
}
}
}

TEST(FlowTest, basic_and_not_flow) {
for (double in: {1.0, 0.5, 0.25}) {
for (bool strict: {false, true}) {
verify_flow(AndNotFlow(in, strict), {0.4, 0.7, 0.2},
{{in, 0.0, strict},
{in*0.4, in*0.4, false},
{in*0.4*0.3, in*0.4*0.3, false},
{in*0.4*0.3*0.8, in*0.4*0.3*0.8, false}});
}
}
}

TEST(FlowTest, flow_cost) {
std::vector<Item> data = {{0.4, 1.1, 0.6}, {0.7, 1.2, 0.5}, {0.2, 1.3, 0.4}};
EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndFlow>(data, false), 1.1 + 0.4*1.2 + 0.4*0.7*1.3);
EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndFlow>(data, true), 0.6 + 0.4*1.2 + 0.4*0.7*1.3);
EXPECT_DOUBLE_EQ(Item::ordered_cost_of<OrFlow>(data, false), 1.1 + 0.6*1.2 + 0.6*0.3*1.3);
EXPECT_DOUBLE_EQ(Item::ordered_cost_of<OrFlow>(data, true), 0.6 + 0.6*0.5 + 0.6*0.3*0.4);
EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndNotFlow>(data, false), 1.1 + 0.4*1.2 + 0.4*0.3*1.3);
EXPECT_DOUBLE_EQ(Item::ordered_cost_of<AndNotFlow>(data, true), 0.6 + 0.4*1.2 + 0.4*0.3*1.3);
}

TEST(FlowTest, optimal_and_flow) {
for (size_t i = 0; i < 256; ++i) {
auto data = gen_data(7);
Item::sort_for_and(data);
double min_cost = Item::cost_of_and(data);
double max_cost = 0.0;
auto check = [min_cost,&max_cost](const std::vector<Item> &my_data) noexcept {
double my_cost = Item::cost_of_and(my_data);
EXPECT_LE(min_cost, my_cost);
max_cost = std::max(max_cost, my_cost);
};
each_perm(data, check);
fprintf(stderr, " and cost(%zu): min: %g, max: %g, factor: %g\n",
i, min_cost, max_cost, max_cost / min_cost);
for (size_t i = 0; i < loop_cnt; ++i) {
for (bool strict: {false, true}) {
auto data = gen_data(7);
double ref_est = Item::estimate_of<AndFlow>(data);
double min_cost = Item::cost_of<AndFlow>(data, strict);
double max_cost = 0.0;
Item::sort<AndFlow>(data, strict);
EXPECT_EQ(Item::ordered_cost_of<AndFlow>(data, strict), min_cost);
auto check = [&](const std::vector<Item> &my_data) noexcept {
double my_cost = Item::ordered_cost_of<AndFlow>(my_data, strict);
EXPECT_LE(min_cost, my_cost);
max_cost = std::max(max_cost, my_cost);
};
each_perm(data, check);
if (loop_cnt < 1024 || i % 1024 == 0) {
fprintf(stderr, " AND cost(%zu,%s): min: %g, max: %g, factor: %g\n",
i, strict ? "strict" : "non-strict", min_cost, max_cost, max_cost / min_cost);
}
EXPECT_NEAR(ref_est, Item::estimate_of<AndFlow>(data), 1e-9);
}
}
}

TEST(FlowTest, optimal_or_flow) {
for (size_t i = 0; i < 256; ++i) {
auto data = gen_data(7);
Item::sort_for_or(data);
double min_cost = Item::cost_of_or(data);
double max_cost = 0.0;
auto check = [min_cost,&max_cost](const std::vector<Item> &my_data) noexcept {
double my_cost = Item::cost_of_or(my_data);
EXPECT_LE(min_cost, my_cost);
max_cost = std::max(max_cost, my_cost);
};
each_perm(data, check);
fprintf(stderr, " or cost(%zu): min: %g, max: %g, factor: %g\n",
i, min_cost, max_cost, max_cost / min_cost);
for (size_t i = 0; i < loop_cnt; ++i) {
for (bool strict: {false, true}) {
auto data = gen_data(7);
double min_cost = Item::cost_of<OrFlow>(data, strict);
double max_cost = 0.0;
Item::sort<OrFlow>(data, strict);
EXPECT_EQ(Item::ordered_cost_of<OrFlow>(data, strict), min_cost);
auto check = [&](const std::vector<Item> &my_data) noexcept {
double my_cost = Item::ordered_cost_of<OrFlow>(my_data, strict);
EXPECT_LE(min_cost, my_cost);
max_cost = std::max(max_cost, my_cost);
};
each_perm(data, check);
if (loop_cnt < 1024 || i % 1024 == 0) {
fprintf(stderr, " OR cost(%zu,%s): min: %g, max: %g, factor: %g\n",
i, strict ? "strict" : "non-strict", min_cost, max_cost, max_cost / min_cost);
}
}
}
}

TEST(FlowTest, optimal_and_not_flow) {
for (size_t i = 0; i < loop_cnt; ++i) {
for (bool strict: {false, true}) {
auto data = gen_data(7);
Item first = data[0];
double min_cost = Item::cost_of<AndNotFlow>(data, strict);
double max_cost = 0.0;
Item::sort<AndNotFlow>(data, strict);
EXPECT_EQ(data[0], first);
EXPECT_EQ(Item::ordered_cost_of<AndNotFlow>(data, strict), min_cost);
auto check = [&](const std::vector<Item> &my_data) noexcept {
if (my_data[0] == first) {
double my_cost = Item::ordered_cost_of<AndNotFlow>(my_data, strict);
EXPECT_LE(min_cost, my_cost);
max_cost = std::max(max_cost, my_cost);
}
};
each_perm(data, check);
if (loop_cnt < 1024 || i % 1024 == 0) {
fprintf(stderr, " ANDNOT cost(%zu,%s): min: %g, max: %g, factor: %g\n",
i, strict ? "strict" : "non-strict", min_cost, max_cost, max_cost / min_cost);
}
}
}
}

Expand Down
30 changes: 0 additions & 30 deletions searchlib/src/vespa/searchlib/queryeval/blueprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,22 +144,6 @@ class Blueprint
return (total_docs == 0) ? 0.0 : double(est) / double(total_docs);
}

static double cost_of(const Children &children, auto flow) {
double cost = 0.0;
for (const auto &child: children) {
cost += flow.flow() * child->cost();
flow.add(child->estimate());
}
return cost;
}

static double estimate_of(const Children &children, auto flow) {
for (const auto &child: children) {
flow.add(child->estimate());
}
return flow.estimate();
}

// utility that just takes maximum estimate
static HitEstimate max(const std::vector<HitEstimate> &data);

Expand All @@ -172,20 +156,6 @@ class Blueprint
// lower limit for docid_limit: max child estimate
static HitEstimate sat_sum(const std::vector<HitEstimate> &data, uint32_t docid_limit);

// sort children to minimize total cost of OR flow
struct MinimalOrCost {
bool operator () (const auto &a, const auto &b) const noexcept {
return a->estimate() / a->cost() > b->estimate() / b->cost();
}
};

// sort children to minimize total cost of AND flow
struct MinimalAndCost {
bool operator () (const auto &a, const auto &b) const noexcept {
return (1.0 - a->estimate()) / a->cost() > (1.0 - b->estimate()) / b->cost();
}
};

// utility to get the greater estimate to sort first, higher tiers last
struct TieredGreaterEstimate {
bool operator () (const auto &a, const auto &b) const noexcept {
Expand Down
Loading

0 comments on commit 7ff3cdf

Please sign in to comment.