From 7eab50befd157778d58ba8589c3521a44ef71f48 Mon Sep 17 00:00:00 2001 From: AntonChern Date: Mon, 4 Dec 2023 17:28:44 +0300 Subject: [PATCH 1/6] Add GFD tools Added classes for working with graphs and GFDs. Graphs are presented as boost graphs. --- CMakeLists.txt | 2 +- src/core/algorithms/gfd/gfd.h | 45 ++++++++++++++++++++++ src/core/algorithms/gfd/graph_descriptor.h | 19 +++++++++ src/tests/CMakeLists.txt | 2 +- 4 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 src/core/algorithms/gfd/gfd.h create mode 100644 src/core/algorithms/gfd/graph_descriptor.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b96b4be9b..11536b3fc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,7 @@ endif() # configuring boost set(Boost_USE_STATIC_LIBS OFF) -find_package(Boost 1.72.0 REQUIRED COMPONENTS container thread) +find_package(Boost 1.72.0 REQUIRED COMPONENTS container thread graph) include_directories(${Boost_INCLUDE_DIRS}) message(${Boost_INCLUDE_DIRS}) diff --git a/src/core/algorithms/gfd/gfd.h b/src/core/algorithms/gfd/gfd.h new file mode 100644 index 0000000000..07548d1372 --- /dev/null +++ b/src/core/algorithms/gfd/gfd.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include + +#include "graph_descriptor.h" + +using Token = std::pair; +using Literal = std::pair; + +class Gfd { +private: + graph_t pattern_; + std::vector premises_; + std::vector conclusion_; + +public: + Gfd() = default; + + Gfd(graph_t& pattern, std::vector& premises, std::vector& conclusion) + : pattern_(pattern), premises_(premises), conclusion_(conclusion) {} + + graph_t GetPattern() const { + return pattern_; + } + + std::vector GetPremises() const { + return premises_; + } + + std::vector GetConclusion() const { + return conclusion_; + } + + void SetPattern(graph_t& pattern) { + pattern_ = pattern; + } + + void SetPremises(std::vector& premises) { + premises_ = premises; + } + + void SetConclusion(std::vector& conclusion) { + conclusion_ = conclusion; + } +}; diff --git 
a/src/core/algorithms/gfd/graph_descriptor.h b/src/core/algorithms/gfd/graph_descriptor.h new file mode 100644 index 0000000000..11ec3f4738 --- /dev/null +++ b/src/core/algorithms/gfd/graph_descriptor.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include + +#include + +struct Vertex { + int node_id; + std::map attributes; +}; + +struct Edge { + std::string label; +}; + +using graph_t = boost::adjacency_list; +using vertex_t = boost::graph_traits::vertex_descriptor; +using edge_t = boost::graph_traits::edge_descriptor; diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index e97f1b1fca..9d0b6e6a3e 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -9,7 +9,7 @@ add_executable(${BINARY} ${test_sources}) add_test(NAME ${BINARY} COMMAND ${BINARY}) # linking with gtest and implemented classes -target_link_libraries(${BINARY} PRIVATE ${CMAKE_PROJECT_NAME} gtest gmock) +target_link_libraries(${BINARY} PRIVATE ${CMAKE_PROJECT_NAME} gtest gmock Boost::graph) # copying sample csv's for testing add_custom_target(copy-files ALL From a7d1f3a784c23c38dd5202ed16b515f53bd1c137 Mon Sep 17 00:00:00 2001 From: AntonChern Date: Mon, 4 Dec 2023 17:31:53 +0300 Subject: [PATCH 2/6] Add graph and GFD parser The parser provides methods for writing and reading graphs and GFDs from a file. The graph is represented in text format in the DOT language. To represent GFD, lists of literals are indicated before the graph-pattern (premises are the first line, conclusion is the second) in the format index.feature=name (index1.feature1=index2.feature2) separated by a space. 
--- src/core/parser/graph_parser/graph_parser.cpp | 175 ++++++++++++++++++ src/core/parser/graph_parser/graph_parser.h | 28 +++ 2 files changed, 203 insertions(+) create mode 100644 src/core/parser/graph_parser/graph_parser.cpp create mode 100644 src/core/parser/graph_parser/graph_parser.h diff --git a/src/core/parser/graph_parser/graph_parser.cpp b/src/core/parser/graph_parser/graph_parser.cpp new file mode 100644 index 0000000000..12a1e9d687 --- /dev/null +++ b/src/core/parser/graph_parser/graph_parser.cpp @@ -0,0 +1,175 @@ +#include "graph_parser.h" + +#include +#include +#include +#include +#include + +namespace parser { + +namespace { + +std::vector Split(std::string str, std::string sep) { + std::vector result = {}; + if (str == "") { + return result; + } + size_t pos = 0; + while ((pos = str.find(sep)) != std::string::npos) { + result.push_back(str.substr(0, pos)); + str.erase(0, pos + sep.length()); + } + result.push_back(str); + return result; +}; + +std::vector ParseLiterals(std::istream& stream) { + std::vector result = {}; + + std::string line; + std::getline(stream, line); + boost::algorithm::trim(line); + auto tokens = Split(line, " "); + for (auto token : tokens) { + auto custom_names = Split(token, "="); + auto names1 = Split(custom_names.at(0), "."); + int index1 = names1.size() == 1 ? -1 : stoi(names1.at(0)); + std::string name1 = *(--names1.end()); + Token t1(index1, name1); + + auto names2 = Split(custom_names.at(1), "."); + int index2 = names2.size() == 1 ? -1 : stoi(names2.at(0)); + std::string name2 = *(--names2.end()); + Token t2(index2, name2); + + result.push_back(Literal(t1, t2)); + } + + return result; +}; + +void WriteLiterals(std::ostream& stream, std::vector const& literals) { + for (Literal const& l : literals) { + std::string token; + + Token fst_token = l.first; + token = fst_token.first == -1 ? 
"" : (std::to_string(fst_token.first) + "."); + token += fst_token.second; + stream << token; + + stream << "="; + + Token snd_token = l.second; + token = snd_token.first == -1 ? "" : (std::to_string(snd_token.first) + "."); + token += snd_token.second; + stream << token; + + stream << " "; + } + stream << std::endl; +}; + +} // namespace + +namespace graph_parser { + +using AMap = boost::property_map Vertex::*>::type; +using RMap = boost::property_map::type; + +namespace { +struct NewAttr { + using Ptr = boost::shared_ptr; + +private: + template + static Ptr MakeDyn(PMap m) { + using DM = boost::detail::dynamic_property_map_adaptor; + boost::shared_ptr sp = boost::make_shared(m); + return boost::static_pointer_cast(sp); + } + +public: + AMap attrs; + + NewAttr(AMap a) : attrs(a) {} + + Ptr operator()(std::string const& name, boost::any const& descr, boost::any const&) const { + if (typeid(vertex_t) == descr.type()) + return MakeDyn(boost::make_function_property_map( + boost::bind(*this, boost::placeholders::_1, name))); + + return Ptr(); + }; + + using result_type = std::string&; + + std::string& operator()(vertex_t v, std::string const& name) const { + return attrs[v][name]; + } +}; +} // namespace + +graph_t ReadGraph(std::istream& stream) { + graph_t result; + NewAttr newattr(get(&Vertex::attributes, result)); + boost::dynamic_properties dp(newattr); + dp.property("label", get(&Edge::label, result)); + dp.property("node_id", get(&Vertex::node_id, result)); + read_graphviz(stream, result, dp); + return result; +}; + +graph_t ReadGraph(std::filesystem::path const& path) { + std::ifstream f(path); + graph_t result = ReadGraph(f); + f.close(); + return result; +}; + +void WriteGraph(std::ostream& stream, graph_t& result) { + boost::attributes_writer vw(get(&Vertex::attributes, result)); + boost::label_writer ew(get(&Edge::label, result)); + write_graphviz(stream, result, vw, ew); +}; + +void WriteGraph(std::filesystem::path const& path, graph_t& result) { + 
std::ofstream f(path); + WriteGraph(f, result); + f.close(); +}; + +Gfd ReadGfd(std::istream& stream) { + std::vector premises = ParseLiterals(stream); + std::vector conclusion = ParseLiterals(stream); + graph_t pattern = ReadGraph(stream); + Gfd result = Gfd(); + result.SetPattern(pattern); + result.SetPremises(premises); + result.SetConclusion(conclusion); + return result; +}; + +Gfd ReadGfd(std::filesystem::path const& path) { + std::ifstream f(path); + Gfd result = ReadGfd(f); + f.close(); + return result; +}; + +void WriteGfd(std::ostream& stream, Gfd& result) { + WriteLiterals(stream, result.GetPremises()); + WriteLiterals(stream, result.GetConclusion()); + graph_t pattern = result.GetPattern(); + WriteGraph(stream, pattern); +}; + +void WriteGfd(std::filesystem::path const& path, Gfd& result) { + std::ofstream f(path); + WriteGfd(f, result); + f.close(); +}; + +} // namespace graph_parser + +} // namespace parser diff --git a/src/core/parser/graph_parser/graph_parser.h b/src/core/parser/graph_parser/graph_parser.h new file mode 100644 index 0000000000..b69fe24625 --- /dev/null +++ b/src/core/parser/graph_parser/graph_parser.h @@ -0,0 +1,28 @@ +#pragma once +#include +#include +#include +#include + +#include "algorithms/gfd/gfd.h" +#include "algorithms/gfd/graph_descriptor.h" + +namespace parser { + +namespace graph_parser { + +graph_t ReadGraph(std::istream& stream); +graph_t ReadGraph(std::filesystem::path const& path); + +void WriteGraph(std::ostream& stream, graph_t& result); +void WriteGraph(std::filesystem::path const& path, graph_t& result); + +Gfd ReadGfd(std::istream& stream); +Gfd ReadGfd(std::filesystem::path const& path); + +void WriteGfd(std::ostream& stream, Gfd& result); +void WriteGfd(std::filesystem::path const& path, Gfd& result); + +} // namespace graph_parser + +} // namespace parser From ba6a0a9551aa124b82d425d5ce2db48655ed3142 Mon Sep 17 00:00:00 2001 From: AntonChern Date: Mon, 4 Dec 2023 17:44:31 +0300 Subject: [PATCH 3/6] Implement 
baseline GFD validation algorithm Algorithm for checking if a graph functional dependency satisfies a given graph. Algorithm can be run from the console. --- src/core/algorithms/algorithm_types.h | 7 +- src/core/algorithms/algorithms.h | 3 + src/core/algorithms/gfd/balancer.cpp | 172 ++++++++++ src/core/algorithms/gfd/balancer.h | 27 ++ src/core/algorithms/gfd/gfd_handler.cpp | 61 ++++ src/core/algorithms/gfd/gfd_handler.h | 43 +++ src/core/algorithms/gfd/gfd_validation.cpp | 382 +++++++++++++++++++++ src/core/algorithms/gfd/gfd_validation.h | 27 ++ src/core/config/descriptions.h | 2 + src/core/config/names.h | 2 + 10 files changed, 724 insertions(+), 2 deletions(-) create mode 100644 src/core/algorithms/gfd/balancer.cpp create mode 100644 src/core/algorithms/gfd/balancer.h create mode 100644 src/core/algorithms/gfd/gfd_handler.cpp create mode 100644 src/core/algorithms/gfd/gfd_handler.h create mode 100644 src/core/algorithms/gfd/gfd_validation.cpp create mode 100644 src/core/algorithms/gfd/gfd_validation.h diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h index 9c2fee0be1..777933dce9 100644 --- a/src/core/algorithms/algorithm_types.h +++ b/src/core/algorithms/algorithm_types.h @@ -9,7 +9,7 @@ namespace algos { using AlgorithmTypes = std::tuple; + cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, GfdValidation>; // clang-format off /* Enumeration of all supported non-pipeline algorithms. 
If you implement a new @@ -57,7 +57,10 @@ BETTER_ENUM(AlgorithmType, char, ucc_verifier, /* Inclusion dependency mining algorithms */ - faida + faida, + +/* Graph functional dependency mining algorithms */ + gfdvalid ) // clang-format on diff --git a/src/core/algorithms/algorithms.h b/src/core/algorithms/algorithms.h index a419fb1d8c..843e01381d 100644 --- a/src/core/algorithms/algorithms.h +++ b/src/core/algorithms/algorithms.h @@ -37,3 +37,6 @@ /* Inclusion dependency mining algorithms */ #include "algorithms/ind/faida/faida.h" + +/* Graph functional dependency mining algorithms */ +#include "algorithms/gfd/gfd_validation.h" diff --git a/src/core/algorithms/gfd/balancer.cpp b/src/core/algorithms/gfd/balancer.cpp new file mode 100644 index 0000000000..74c106fa06 --- /dev/null +++ b/src/core/algorithms/gfd/balancer.cpp @@ -0,0 +1,172 @@ +#include "balancer.h" + +#include +#include +#include +#include + +std::vector> Balancer::Balance(std::vector const& weights, + int const& processors_num) { + m_ = std::min(processors_num, static_cast(weights.size())); + result_ = {}; + if (weights.empty()) { + result_.resize(processors_num); + return result_; + } + for (std::size_t i = 0; i < m_; ++i) { + // the first value is index + std::vector temp = {static_cast(i)}; + result_.push_back(temp); + } + // fill processors initially + // count optimal + optimal_ = 0; + std::size_t i = 0; + for (int const& weight : weights) { + result_.at(i++).push_back(weight); + i = i == m_ ? 
0 : i; + optimal_ += weight; + } + optimal_ /= m_; + // sort processors (for convenience) + for (std::vector& processor : result_) { + std::sort(processor.begin() + 1, processor.end()); + } + // ALGORITHM + DeleteLarge(); + Prepare(); + DeleteFirstSmall(); + DeleteSecondSmall(); + FullLarge(); + FullSmall(); + // delete indices + for (std::vector& processor : result_) { + processor.erase(processor.begin()); + } + for (std::size_t i = 0; i < processors_num - m_; ++i) { + std::vector empty = {}; + result_.push_back(empty); + } + return result_; +} + +void Balancer::DeleteLarge() { + deleted_large_ = {}; + for (std::vector& processor : result_) { + auto border = processor.end(); + for (auto it = --processor.end(); it != processor.begin() + 1; --it) { + if (*(it - 1) > optimal_ / 2) { + deleted_large_.push_back(*it); + border = it; + } else { + break; + } + } + processor.erase(border, processor.end()); + } +} + +void Balancer::Prepare() { + for (std::size_t i = 0; i < m_; ++i) { + quality_.emplace(i, std::tuple(0, 0, 0)); + } + for (std::vector const& processor : result_) { + auto last_small = processor.end(); + auto last = processor.end(); + if (*(--processor.end()) > optimal_ / 2) { + --last_small; + } + if (processor.begin() + 1 == last_small) { + continue; + } + int a = 0; + int b = 0; + float sum_small = std::accumulate(processor.begin() + 1, last_small, 0, std::plus()); + float sum = std::accumulate(processor.begin() + 1, last, 0, std::plus()); + while (sum_small > optimal_ / 2) { + ++a; + --last_small; + sum_small -= *last_small; + } + while (sum > optimal_) { + ++b; + --last; + sum -= *last; + } + std::get<0>(quality_.at(processor.at(0))) = a; + std::get<1>(quality_.at(processor.at(0))) = b; + std::get<2>(quality_.at(processor.at(0))) = a - b; + } +} + +void Balancer::DeleteFirstSmall() { + // sort for convenience + deleted_small_ = {}; + std::vector> small_processors = {}; + std::vector> large_processors = {}; + for (std::vector const& processor : result_) { + 
if (*(--processor.end()) > optimal_ / 2) { + large_processors.push_back(processor); + } else { + small_processors.push_back(processor); + } + } + auto cGreater = [this](std::vector const& a, std::vector const& b) { + return std::get<2>(quality_.at(a.at(0))) > std::get<2>(quality_.at(b.at(0))); + }; + sort(small_processors.begin(), small_processors.end(), cGreater); + sort(large_processors.begin(), large_processors.end(), cGreater); + result_.clear(); + result_.insert(result_.end(), small_processors.begin(), small_processors.end()); + result_.insert(result_.end(), large_processors.begin(), large_processors.end()); + large_procs_num_ = large_processors.size(); + std::size_t larges_num = large_processors.size() + deleted_large_.size(); + // work + border_ = larges_num < m_ ? result_.end() - larges_num : result_.begin(); + for (auto it = border_; it != result_.end(); ++it) { + auto last = it->end(); + if (*(last - 1) > optimal_ / 2) { + --last; + } + for (auto cur = last - std::get<0>(quality_.at(*it->begin())); cur != last; ++cur) { + deleted_small_.push_back(*cur); + } + it->erase(last - std::get<0>(quality_.at(*it->begin())), last); + } +} + +void Balancer::DeleteSecondSmall() { + for (auto it = result_.begin(); it != border_; ++it) { + auto last = it->end(); + for (auto cur = last - std::get<1>(quality_.at(*it->begin())); cur != last; ++cur) { + deleted_small_.push_back(*cur); + } + it->erase(last - std::get<1>(quality_.at(*it->begin())), last); + } +} + +void Balancer::PutWeight(int const& weight) { + sort(result_.begin(), result_.end(), [](std::vector const& a, std::vector const& b) { + return std::accumulate(a.begin(), a.end(), 0, std::plus()) < + std::accumulate(b.begin(), b.end(), 0, std::plus()); + }); + result_.begin()->push_back(weight); +} + +void Balancer::FullLarge() { + std::size_t i = 0; + for (int const& weight : deleted_large_) { + if (i < m_ - large_procs_num_) { + (result_.begin() + i)->push_back(weight); + } else { + PutWeight(weight); + } + ++i; 
+ } +} + +void Balancer::FullSmall() { + for (int const& weight : deleted_small_) { + PutWeight(weight); + } +} diff --git a/src/core/algorithms/gfd/balancer.h b/src/core/algorithms/gfd/balancer.h new file mode 100644 index 0000000000..ed62103251 --- /dev/null +++ b/src/core/algorithms/gfd/balancer.h @@ -0,0 +1,27 @@ +#pragma once +#include +#include + +class Balancer { +private: + std::size_t m_; + double optimal_; + std::vector>::iterator border_; + std::vector> result_; + std::vector deleted_large_ = {}; + std::vector deleted_small_ = {}; + std::size_t large_procs_num_; + std::map> quality_; + + void DeleteLarge(); + void Prepare(); + void DeleteFirstSmall(); + void DeleteSecondSmall(); + void PutWeight(int const& weight); + void FullLarge(); + void FullSmall(); + +public: + std::vector> Balance(std::vector const& weights, + int const& processors_num); +}; diff --git a/src/core/algorithms/gfd/gfd_handler.cpp b/src/core/algorithms/gfd/gfd_handler.cpp new file mode 100644 index 0000000000..7ade54a522 --- /dev/null +++ b/src/core/algorithms/gfd/gfd_handler.cpp @@ -0,0 +1,61 @@ +#include "gfd_handler.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "balancer.h" +#include "config/equal_nulls/option.h" +#include "config/names_and_descriptions.h" +#include "config/option_using.h" +#include "config/tabular_data/input_table/option.h" +#include "config/thread_number/option.h" + +namespace algos { + +GfdHandler::GfdHandler() : Algorithm({}) { + RegisterOptions(); + MakeOptionsAvailable({config::names::kGfdData, config::names::kGraphData}); +}; + +void GfdHandler::RegisterOptions() { + using namespace config::names; + using namespace config::descriptions; + DESBORDANTE_OPTION_USING; + + RegisterOption(config::Option{&gfd_paths_, kGfdData, kDGfdData}); + RegisterOption(config::Option{&graph_path_, kGraphData, kDGraphData}); +} + +void GfdHandler::LoadDataInternal() { + std::ifstream f(graph_path_); + graph_ = 
parser::graph_parser::ReadGraph(f); + f.close(); + for (auto const& path : gfd_paths_) { + auto gfd_path = path; + f.open(gfd_path); + Gfd gfd = parser::graph_parser::ReadGfd(f); + f.close(); + gfds_.push_back(gfd); + } +} + +void GfdHandler::ResetState() {} + +unsigned long long GfdHandler::ExecuteInternal() { + auto start_time = std::chrono::system_clock::now(); + + result_ = GenerateSatisfiedGfds(graph_, gfds_); + + auto elapsed_milliseconds = std::chrono::duration_cast( + std::chrono::system_clock::now() - start_time); + std::cout << "Satisfied GFDs: " << result_.size() << "/" << gfds_.size() << std::endl; + return elapsed_milliseconds.count(); +} + +} // namespace algos diff --git a/src/core/algorithms/gfd/gfd_handler.h b/src/core/algorithms/gfd/gfd_handler.h new file mode 100644 index 0000000000..3d9e6acabf --- /dev/null +++ b/src/core/algorithms/gfd/gfd_handler.h @@ -0,0 +1,43 @@ +#pragma once +#include + +#include "algorithms/algorithm.h" +#include "config/names_and_descriptions.h" +#include "gfd.h" +#include "parser/graph_parser/graph_parser.h" + +namespace algos { + +class GfdHandler : public Algorithm { +protected: + std::filesystem::path graph_path_; + std::vector gfd_paths_; + + graph_t graph_; + std::vector gfds_; + std::vector result_; + + unsigned long long ExecuteInternal(); + + void ResetState() final; + void LoadDataInternal() final; + + void RegisterOptions(); + +public: + virtual std::vector GenerateSatisfiedGfds(graph_t const& graph, + std::vector const& gfds) = 0; + + GfdHandler(); + + GfdHandler(graph_t graph_, std::vector gfds_) + : Algorithm({}), graph_(graph_), gfds_(gfds_) { + ExecutePrepare(); + } + + std::vector GfdList() { + return result_; + } +}; + +} // namespace algos diff --git a/src/core/algorithms/gfd/gfd_validation.cpp b/src/core/algorithms/gfd/gfd_validation.cpp new file mode 100644 index 0000000000..934fd4f4ed --- /dev/null +++ b/src/core/algorithms/gfd/gfd_validation.cpp @@ -0,0 +1,382 @@ +#include "gfd_validation.h" + 
+#include +#include +#include + +#include +#include +#include +#include + +#include "balancer.h" +#include "config/equal_nulls/option.h" +#include "config/names_and_descriptions.h" +#include "config/option_using.h" +#include "config/tabular_data/input_table/option.h" +#include "config/thread_number/option.h" + +namespace { + +using namespace algos; + +std::vector> GetPartition(std::vector const& candidates, + config::ThreadNumType const& threads_num) { + std::vector> result = {}; + + int musthave = candidates.size() / threads_num; + int oversized_num = candidates.size() % threads_num; + + std::vector::const_iterator from, to; + from = candidates.begin(); + to = std::next(candidates.begin(), musthave + 1); + for (int i = 0; i < oversized_num; ++i) { + std::vector temp(from, to); + result.push_back(temp); + from = std::next(from, musthave + 1); + to = std::next(to, musthave + 1); + } + to--; + for (int i = 0; i < threads_num - oversized_num; ++i) { + std::vector temp(from, to); + result.push_back(temp); + from = std::next(from, musthave); + to = std::next(to, musthave); + } + + return result; +} + +std::vector GetCandidates(graph_t const& graph, std::string const& label) { + std::vector result = {}; + + BGL_FORALL_VERTICES_T(v, graph, graph_t) { + if (graph[v].attributes.at("label") == label) { + result.push_back(v); + } + } + + return result; +} + +vertex_t GetCenter(graph_t const& pattern, int& radius) { + using DistanceProperty = boost::exterior_vertex_property; + using DistanceMatrix = typename DistanceProperty::matrix_type; + using DistanceMatrixMap = typename DistanceProperty::matrix_map_type; + + using EccentricityProperty = boost::exterior_vertex_property; + using EccentricityContainer = typename EccentricityProperty::container_type; + using EccentricityMap = typename EccentricityProperty::map_type; + + DistanceMatrix distances(boost::num_vertices(pattern)); + DistanceMatrixMap dm(distances, pattern); + + using WeightMap = boost::constant_property_map; + + 
WeightMap wm(1); + boost::floyd_warshall_all_pairs_shortest_paths(pattern, dm, weight_map(wm)); + + int r, d; + EccentricityContainer eccs(boost::num_vertices(pattern)); + EccentricityMap em(eccs, pattern); + boost::tie(r, d) = all_eccentricities(pattern, dm, em); + radius = r; + + vertex_t result = 0; + typename boost::graph_traits::vertex_iterator i, end; + for (boost::tie(i, end) = vertices(pattern); i != end; ++i) { + bool is_center = true; + typename boost::graph_traits::vertex_iterator j; + for (j = vertices(pattern).first; j != end; ++j) { + if (get(get(dm, *i), *j) > r) { + is_center = false; + break; + } + } + if (is_center) { + result = pattern[*i].node_id; + break; + } + } + return result; +} + +void CalculateMessages(graph_t const& graph, std::vector const& requests, + std::map>& weighted_messages) { + for (Request const& request : requests) { + int gfd_index = std::get<0>(request); + vertex_t center = std::get<1>(request); + int radius = std::get<2>(request); + std::vector candidates = std::get<3>(request); + for (vertex_t const& candidate : candidates) { + std::set vertices = {candidate}; + std::set current = {candidate}; + for (int i = 0; i < radius; ++i) { + std::set temp = {}; + for (auto& v : current) { + typename boost::graph_traits::adjacency_iterator adjacency_it, + adjacency_end; + boost::tie(adjacency_it, adjacency_end) = boost::adjacent_vertices(v, graph); + for (; adjacency_it != adjacency_end; ++adjacency_it) { + if (vertices.find(*adjacency_it) == vertices.end()) { + vertices.insert(*adjacency_it); + temp.insert(*adjacency_it); + } + } + } + current = temp; + } + int weight = vertices.size(); + for (auto& v : vertices) { + for (auto& u : vertices) { + if (boost::edge(v, u, graph).second) { + weight++; + } + } + } + Message message(gfd_index, center, candidate); + if (weighted_messages.find(weight) != weighted_messages.end()) { + weighted_messages.at(weight).push_back(message); + } else { + std::vector current = {message}; + 
weighted_messages.emplace(weight, current);
+            }
+        }
+    }
+}
+
+// Callback handed to boost::vf2_subgraph_iso. For every embedding `f` of the pattern
+// (`query`) into `graph` it evaluates the GFD on that embedding; `res` is set to false
+// as soon as one embedding fulfils all premises but violates the conclusion.
+struct CheckCallback {
+private:
+    graph_t const& query;
+    graph_t const& graph;
+    // Stored by value: the caller passes temporaries returned by the Gfd getters.
+    const std::vector<Literal> premises;
+    const std::vector<Literal> conclusion;
+    bool& res;
+
+public:
+    CheckCallback(graph_t const& query_, graph_t const& graph_,
+                  std::vector<Literal> const& premises_, std::vector<Literal> const& conclusion_,
+                  bool& res_)
+        : query(query_), graph(graph_), premises(premises_), conclusion(conclusion_), res(res_) {}
+
+    // Returns true to let the search continue, false to stop it after a violation.
+    template <typename CorrespondenceMap1To2, typename CorrespondenceMap2To1>
+    bool operator()(CorrespondenceMap1To2 f, CorrespondenceMap2To1) const {
+        // Resolves one side of a literal into `value`.
+        // Index -1 marks a constant: the token text itself is the value.
+        // Otherwise the index is a pattern vertex whose image under `f` supplies the
+        // attribute; a missing attribute makes the literal unsatisfied (returns false).
+        auto resolve = [this, &f](Token const& token, std::string& value) {
+            if (token.first == -1) {
+                value = token.second;
+                return true;
+            }
+            vertex_t const u = boost::vertex(token.first, query);
+            vertex_t const v = get(f, u);
+            auto const& attrs = graph[v].attributes;
+            auto const it = attrs.find(token.second);
+            if (it == attrs.end()) {
+                return false;
+            }
+            value = it->second;
+            return true;
+        };
+
+        // A literal list holds when both sides of every literal resolve and compare equal.
+        // Both sides go through the same helper, so the right-hand token is looked up
+        // with its own vertex index and stored in its own variable.
+        auto satisfied = [&resolve](std::vector<Literal> const& literals) {
+            for (Literal const& l : literals) {
+                std::string fst;
+                std::string snd;
+                if (!resolve(l.first, fst) || !resolve(l.second, snd)) {
+                    return false;
+                }
+                if (fst != snd) {
+                    return false;
+                }
+            }
+            return true;
+        };
+
+        // Premises not met: this embedding says nothing about the GFD, keep searching.
+        if (!satisfied(premises)) {
+            return true;
+        }
+        if (!satisfied(conclusion)) {
+            res = false;
+            return false;
+        }
+        return true;
+    }
+};
+
+struct VCompare {
+    graph_t const& pattern;
+    graph_t const& graph;
+    vertex_t pinted_fr;
+    vertex_t pinted_to;
+
+    bool operator()(vertex_t fr, vertex_t to) const {
+        if (fr == pinted_fr && to == pinted_to) {
+            return true;
+        }
+        if (fr == pinted_fr || to == pinted_to) {
+            return false;
+        }
+        return pattern[fr].attributes.at("label") == graph[to].attributes.at("label");
+    }
+};
+
+struct ECompare {
+    graph_t const& pattern;
+    graph_t const&
graph; + + bool operator()(edge_t fr, edge_t to) const { + return pattern[fr].label == graph[to].label; + } +}; + +void CalculateUnsatisfied(graph_t const& graph, std::vector const& messages, + std::map const& indexed_gfds, std::set& unsatisfied) { + for (auto& message : messages) { + int gfd_index = std::get<0>(message); + if (unsatisfied.find(gfd_index) != unsatisfied.end()) { + continue; + } + + vertex_t u = std::get<1>(message); + vertex_t v = std::get<2>(message); + + Gfd gfd = indexed_gfds.at(gfd_index); + graph_t pattern = gfd.GetPattern(); + + VCompare vcompare{pattern, graph, u, v}; + ECompare ecompare{pattern, graph}; + + bool satisfied = true; + CheckCallback callback(pattern, graph, gfd.GetPremises(), gfd.GetConclusion(), satisfied); + + boost::vf2_subgraph_iso(pattern, graph, callback, get(boost::vertex_index, pattern), + get(boost::vertex_index, graph), vertex_order_by_mult(pattern), + ecompare, vcompare); + if (!satisfied) { + unsatisfied.insert(gfd_index); + } + } +} + +} // namespace + +namespace algos { + +GfdValidation::GfdValidation() : GfdHandler() { + RegisterOption(config::ThreadNumberOpt(&threads_num_)); + MakeOptionsAvailable({config::ThreadNumberOpt.GetName()}); +}; + +std::vector GfdValidation::GenerateSatisfiedGfds(graph_t const& graph, + std::vector const& gfds) { + std::vector> requests = {}; + for (int i = 0; i < threads_num_; ++i) { + std::vector empty = {}; + requests.push_back(empty); + } + + std::map indexed_gfds; + int index = 0; + for (auto& gfd : gfds) { + int radius = 0; + vertex_t center = GetCenter(gfd.GetPattern(), radius); + std::vector candidates = + GetCandidates(graph, gfd.GetPattern()[center].attributes.at("label")); + auto partition = GetPartition(candidates, threads_num_); + for (std::size_t i = 0; i < partition.size(); ++i) { + if (!partition.at(i).empty()) { + Request request(index, center, radius, partition.at(i)); + requests[i].push_back(request); + } + } + indexed_gfds.emplace(index, gfd); + index++; + } + + 
std::vector<std::map<int, std::vector<Message>>> weighted_messages;
+    for (int i = 0; i < threads_num_; ++i) {
+        std::map<int, std::vector<Message>> empty;
+        weighted_messages.push_back(empty);
+    }
+
+    // Each worker fills its own map (weight -> messages), so no locking is needed.
+    std::vector<std::thread> threads = {};
+    for (int i = 0; i < threads_num_; ++i) {
+        std::thread thrd(CalculateMessages, std::cref(graph), std::cref(requests.at(i)),
+                         std::ref(weighted_messages.at(i)));
+        threads.push_back(std::move(thrd));
+    }
+    for (std::thread& thrd : threads) {
+        if (thrd.joinable()) {
+            thrd.join();
+        }
+    }
+
+    // Merge the per-thread maps. Lists that share a weight are concatenated;
+    // emplace() would keep only the first thread's list and drop the rest.
+    std::map<int, std::vector<Message>> all_weighted_messages;
+    for (int i = 0; i < threads_num_; ++i) {
+        for (auto& kv : weighted_messages.at(i)) {
+            std::vector<Message>& merged = all_weighted_messages[kv.first];
+            merged.insert(merged.end(), kv.second.begin(), kv.second.end());
+        }
+    }
+
+    // One weight entry per message, so the balancer distributes every message
+    // rather than a single one per distinct weight.
+    std::vector<int> weights = {};
+    for (auto& kv : all_weighted_messages) {
+        weights.insert(weights.end(), kv.second.size(), kv.first);
+    }
+
+    // balance
+    Balancer balancer;
+    std::vector<std::vector<int>> balanced_weights = balancer.Balance(weights, threads_num_);
+    std::vector<std::vector<Message>> balanced_messages = {};
+    for (std::size_t i = 0; i < balanced_weights.size(); ++i) {
+        std::vector<Message> messages = {};
+        for (int const& weight : balanced_weights.at(i)) {
+            // Messages of equal weight are interchangeable for balancing purposes;
+            // taking from the back avoids shifting the remaining elements.
+            std::vector<Message>& pending = all_weighted_messages.at(weight);
+            messages.push_back(pending.back());
+            pending.pop_back();
+        }
+        balanced_messages.push_back(messages);
+    }
+
+    std::vector<Gfd> result = {};
+    std::vector<std::set<int>> unsatisfied = {};
+    for (int i = 0; i < threads_num_; ++i) {
+        std::set<int> empty = {};
+        unsatisfied.push_back(empty);
+    }
+
+    std::cout << "Messages constructed. Matching..."
<< std::endl; + // calculate unsatisfied forall processor (vf2) + threads.clear(); + for (int i = 0; i < threads_num_; ++i) { + std::thread thrd(CalculateUnsatisfied, std::cref(graph), std::cref(balanced_messages.at(i)), + std::cref(indexed_gfds), std::ref(unsatisfied.at(i))); + threads.push_back(std::move(thrd)); + } + for (std::thread& thrd : threads) { + if (thrd.joinable()) { + thrd.join(); + } + } + // concatmap unsatisfied + std::set all_unsatisfied = {}; + for (int i = 0; i < threads_num_; ++i) { + for (auto& ind : unsatisfied.at(i)) { + all_unsatisfied.insert(ind); + } + } + + for (std::size_t i = 0; i < indexed_gfds.size(); ++i) { + if (all_unsatisfied.find(i) == all_unsatisfied.end()) { + result.push_back(indexed_gfds.at(i)); + } + } + return result; +} + +} // namespace algos diff --git a/src/core/algorithms/gfd/gfd_validation.h b/src/core/algorithms/gfd/gfd_validation.h new file mode 100644 index 0000000000..fe6ec03c21 --- /dev/null +++ b/src/core/algorithms/gfd/gfd_validation.h @@ -0,0 +1,27 @@ +#pragma once +#include + +#include "algorithms/algorithm.h" +#include "algorithms/gfd/gfd_handler.h" +#include "config/names_and_descriptions.h" +#include "config/thread_number/type.h" +#include "gfd.h" + +namespace algos { + +using Request = std::tuple>; +using Message = std::tuple; + +class GfdValidation : public GfdHandler { +private: + config::ThreadNumType threads_num_; + +public: + std::vector GenerateSatisfiedGfds(graph_t const& graph, std::vector const& gfds); + + GfdValidation(); + + GfdValidation(graph_t graph_, std::vector gfds_) : GfdHandler(graph_, gfds_) {} +}; + +} // namespace algos diff --git a/src/core/config/descriptions.h b/src/core/config/descriptions.h index 7e3925984a..a32fb88d2e 100644 --- a/src/core/config/descriptions.h +++ b/src/core/config/descriptions.h @@ -86,4 +86,6 @@ constexpr auto kDIgnoreNullCols = constexpr auto kDIgnoreConstantCols = "Ignore INDs which contain columns filled with only one value. 
May " "increase performance but impacts the result. [true|false]"; +constexpr auto kDGraphData = "Path to dot-file with graph"; +constexpr auto kDGfdData = "Path to file with GFD"; } // namespace config::descriptions diff --git a/src/core/config/names.h b/src/core/config/names.h index 36a7100c27..c560f8d30f 100644 --- a/src/core/config/names.h +++ b/src/core/config/names.h @@ -49,4 +49,6 @@ constexpr auto kSampleSize = "sample_size"; constexpr auto kFindNary = "find_nary"; constexpr auto kIgnoreNullCols = "ignore_null_cols"; constexpr auto kIgnoreConstantCols = "ignore_constant_cols"; +constexpr auto kGraphData = "graph"; +constexpr auto kGfdData = "gfd"; } // namespace config::names From fd594b9d65c1699177272729b7ad68c92354798d Mon Sep 17 00:00:00 2001 From: AntonChern Date: Mon, 4 Dec 2023 17:47:46 +0300 Subject: [PATCH 4/6] Implement efficient GFD validation algorithm The algorithm checking the satisfiability of GFD. Uses CPI algorithm as a subgraph search. --- src/core/algorithms/algorithm_types.h | 11 +- src/core/algorithms/algorithms.h | 1 + src/core/algorithms/gfd/egfd_validation.cpp | 1043 +++++++++++++++++++ src/core/algorithms/gfd/egfd_validation.h | 19 + 4 files changed, 1069 insertions(+), 5 deletions(-) create mode 100644 src/core/algorithms/gfd/egfd_validation.cpp create mode 100644 src/core/algorithms/gfd/egfd_validation.h diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h index 777933dce9..313f169622 100644 --- a/src/core/algorithms/algorithm_types.h +++ b/src/core/algorithms/algorithm_types.h @@ -6,10 +6,10 @@ namespace algos { -using AlgorithmTypes = - std::tuple; +using AlgorithmTypes = std::tuple; // clang-format off /* Enumeration of all supported non-pipeline algorithms. 
If you implement a new @@ -60,7 +60,8 @@ BETTER_ENUM(AlgorithmType, char, faida, /* Graph functional dependency mining algorithms */ - gfdvalid + gfdvalid, + egfdvalid ) // clang-format on diff --git a/src/core/algorithms/algorithms.h b/src/core/algorithms/algorithms.h index 843e01381d..4db60e0c6b 100644 --- a/src/core/algorithms/algorithms.h +++ b/src/core/algorithms/algorithms.h @@ -39,4 +39,5 @@ #include "algorithms/ind/faida/faida.h" /* Graph functional dependency mining algorithms */ +#include "algorithms/gfd/egfd_validation.h" #include "algorithms/gfd/gfd_validation.h" diff --git a/src/core/algorithms/gfd/egfd_validation.cpp b/src/core/algorithms/gfd/egfd_validation.cpp new file mode 100644 index 0000000000..15b1437791 --- /dev/null +++ b/src/core/algorithms/gfd/egfd_validation.cpp @@ -0,0 +1,1043 @@ +#include "egfd_validation.h" + +#include + +#include + +#include "config/equal_nulls/option.h" +#include "config/names_and_descriptions.h" +#include "config/option_using.h" +#include "config/tabular_data/input_table/option.h" + +namespace { + +using namespace algos; +using Match = std::vector::iterator, std::set::iterator>>; + +void FstStepForest(graph_t const& graph, std::map>& rooted_subtree, + std::map& children_amount) { + typename boost::graph_traits::vertex_iterator it, end; + for (boost::tie(it, end) = vertices(graph); it != end; ++it) { + if (boost::degree(*it, graph) != 1) { + continue; + } + typename boost::graph_traits::adjacency_iterator adjacency_it = + boost::adjacent_vertices(*it, graph).first; + + if (rooted_subtree.find(*adjacency_it) != rooted_subtree.end()) { + rooted_subtree.at(*adjacency_it).insert(*it); + children_amount[*adjacency_it]++; + } else { + std::set value = {*it}; + rooted_subtree.emplace(*adjacency_it, value); + children_amount.emplace(*adjacency_it, 1); + } + } +} + +void BuildForest(graph_t const& graph, std::map>& rooted_subtree, + std::map& children_amount) { + bool changed = true; + while (changed) { + changed = false; + 
std::map> temp; + std::map children_temp; + for (auto const& kv : rooted_subtree) { + auto desc = kv.first; + auto children = kv.second; + + if (boost::degree(desc, graph) == (children_amount.at(desc) + 1)) { + changed = true; + typename boost::graph_traits::adjacency_iterator adjacency_it, + adjacency_end; + boost::tie(adjacency_it, adjacency_end) = boost::adjacent_vertices(desc, graph); + for (; adjacency_it != adjacency_end; ++adjacency_it) { + if (children.find(*adjacency_it) != children.end()) { + continue; + } + if (temp.find(*adjacency_it) != temp.end()) { + std::set value = {}; + auto current = temp.at(*adjacency_it); + std::set_union(children.begin(), children.end(), current.begin(), + current.end(), std::inserter(value, value.begin())); + value.insert(desc); + temp[*adjacency_it] = value; + children_temp[*adjacency_it]++; + } else { + std::set value = children; + value.insert(desc); + temp.emplace(*adjacency_it, value); + children_temp.emplace(*adjacency_it, 1); + } + break; + } + } else { + temp.emplace(desc, children); + children_temp.emplace(desc, children_amount.at(desc)); + } + } + rooted_subtree = temp; + children_amount = children_temp; + } +} + +void CfDecompose(graph_t const& graph, std::set& core, + std::vector>& forest) { + if (boost::num_vertices(graph) == (boost::num_edges(graph) + 1)) { + typename boost::graph_traits::vertex_iterator it, end; + for (boost::tie(it, end) = vertices(graph); it != end; ++it) { + core.insert(*it); + } + return; + } + + std::map> rooted_subtree; + std::map children_amount; + FstStepForest(graph, rooted_subtree, children_amount); + BuildForest(graph, rooted_subtree, children_amount); + + std::set not_core_indices = {}; + for (auto const& kv : rooted_subtree) { + for (int child : kv.second) { + not_core_indices.insert(child); + } + } + typename boost::graph_traits::vertex_iterator it, end; + for (boost::tie(it, end) = vertices(graph); it != end; ++it) { + if (not_core_indices.find(*it) == not_core_indices.end()) { + 
core.insert(*it); + } + } + + for (auto const& kv : rooted_subtree) { + std::set indices(kv.second); + indices.insert(kv.first); + forest.push_back(indices); + } +} + +int Mnd(graph_t const& graph, vertex_t const& v) { + typename boost::graph_traits::adjacency_iterator adjacency_it, adjacency_end; + boost::tie(adjacency_it, adjacency_end) = boost::adjacent_vertices(v, graph); + std::size_t result = 0; + for (; adjacency_it != adjacency_end; ++adjacency_it) { + if (result < boost::degree(*adjacency_it, graph)) { + result = boost::degree(*adjacency_it, graph); + } + } + return result; +} + +void CountLabelDegrees(graph_t const& graph, vertex_t const& v, + std::map& result) { + typename boost::graph_traits::adjacency_iterator adjacency_it, adjacency_end; + boost::tie(adjacency_it, adjacency_end) = boost::adjacent_vertices(v, graph); + for (; adjacency_it != adjacency_end; ++adjacency_it) { + if (result.find(graph[*adjacency_it].attributes.at("label")) != result.end()) { + result[graph[*adjacency_it].attributes.at("label")]++; + } else { + result.emplace(graph[*adjacency_it].attributes.at("label"), 1); + } + } +} + +bool CandVerify(graph_t const& graph, vertex_t const& v, graph_t const& query, vertex_t const& u) { + if (Mnd(graph, v) < Mnd(query, u)) { + return false; + } + std::map graph_label_degrees; + CountLabelDegrees(graph, v, graph_label_degrees); + std::map query_label_degrees; + CountLabelDegrees(query, u, query_label_degrees); + + for (auto const& label_degree : query_label_degrees) { + std::string const& label = label_degree.first; + std::size_t const& degree = label_degree.second; + if (graph_label_degrees.find(label) == graph_label_degrees.end() || + graph_label_degrees.at(label) < degree) { + return false; + } + } + return true; +} + +void SortComplexity(std::vector& order, graph_t const& graph, graph_t const& query, + std::map> const& label_classes) { + auto cmpComplexity = [&graph, &query, &label_classes](vertex_t const& a, vertex_t const& b) { + 
std::size_t a_degree = boost::degree(a, query); + int an = 0; + for (const vertex_t& e : label_classes.at(query[a].attributes.at("label"))) { + if (boost::degree(e, graph) >= a_degree) { + an++; + } + } + + std::size_t b_degree = boost::degree(b, query); + int bn = 0; + for (const vertex_t& e : label_classes.at(query[b].attributes.at("label"))) { + if (boost::degree(e, graph) >= b_degree) { + bn++; + } + } + return an / a_degree < bn / b_degree; + }; + std::sort(order.begin(), order.end(), cmpComplexity); +} + +void SortAccurateComplexity(std::vector& order, graph_t const& graph, + graph_t const& query, + std::map> const& label_classes) { + int top = std::min(int(order.size()), 3); + auto cmpAccurateComplexity = [&graph, &query, &label_classes](vertex_t const& a, + vertex_t const& b) { + int a_degree = boost::degree(a, query); + int an = 0; + for (const vertex_t& e : label_classes.at(query[a].attributes.at("label"))) { + if (CandVerify(graph, e, query, a)) { + an++; + } + } + + int b_degree = boost::degree(b, query); + int bn = 0; + for (const vertex_t& e : label_classes.at(query[b].attributes.at("label"))) { + if (CandVerify(graph, e, query, b)) { + bn++; + } + } + return an / a_degree < bn / b_degree; + }; + std::sort(order.begin(), std::next(order.begin(), top), cmpAccurateComplexity); +} + +int GetRoot(graph_t const& graph, graph_t const& query, std::set const& core) { + std::map> label_classes; + typename boost::graph_traits::vertex_iterator it, end; + for (boost::tie(it, end) = vertices(graph); it != end; ++it) { + if (label_classes.find(graph[*it].attributes.at("label")) != label_classes.end()) { + label_classes[graph[*it].attributes.at("label")].insert(*it); + } else { + std::set value = {*it}; + label_classes.emplace(graph[*it].attributes.at("label"), value); + } + } + std::vector order(core.begin(), core.end()); + + SortComplexity(order, graph, query, label_classes); + SortAccurateComplexity(order, graph, query, label_classes); + return *order.begin(); +} 
+ +void MakeLevels(graph_t const& query, vertex_t const& root, std::vector>& levels, + std::map& parent) { + std::set current = {root}; + std::set marked = {root}; + while (!current.empty()) { + levels.push_back(current); + std::set next = {}; + for (vertex_t const& vertex : current) { + typename boost::graph_traits::adjacency_iterator adjacency_it, adjacency_end; + boost::tie(adjacency_it, adjacency_end) = boost::adjacent_vertices(vertex, query); + for (; adjacency_it != adjacency_end; ++adjacency_it) { + if (marked.find(*adjacency_it) == marked.end()) { + marked.insert(*adjacency_it); + next.insert(*adjacency_it); + parent.emplace(*adjacency_it, vertex); + } + } + } + current = next; + } +} + +void MakeNte(graph_t const& query, std::vector>& levels, + std::map& parent, std::set& nte, std::set& snte) { + typename boost::graph_traits::edge_iterator it_edge, end_edge; + for (boost::tie(it_edge, end_edge) = edges(query); it_edge != end_edge; ++it_edge) { + vertex_t origin = boost::source(*it_edge, query); + vertex_t finish = boost::target(*it_edge, query); + if ((parent.find(origin) != parent.end()) && (parent.find(finish) != parent.end()) && + (parent.at(origin) != finish) && (parent.at(finish) != origin)) { + int origin_level = 0; + int finish_level = 0; + for (std::size_t i = 0; i < levels.size(); ++i) { + if (levels.at(i).find(origin) != levels.at(i).end()) { + origin_level = i; + } + if (levels.at(i).find(finish) != levels.at(i).end()) { + finish_level = i; + } + } + if (origin_level == finish_level) { + snte.insert(*it_edge); + } + nte.insert(*it_edge); + } + } +} + +void BfsTree(graph_t const& query, vertex_t const& root, std::vector>& levels, + std::map& parent, std::set& nte, std::set& snte) { + MakeLevels(query, root, levels, parent); + MakeNte(query, levels, parent, nte, snte); +} + +void DirectConstruction(std::set const& lev, graph_t const& graph, graph_t const& query, + std::map>& candidates, + std::map& cnts, + std::map>& unvisited_neighbours, + 
std::set const& snte, std::set& visited) { + for (vertex_t const& u : lev) { + int cnt = 0; + typename boost::graph_traits::adjacency_iterator adjacency_it, adjacency_end; + boost::tie(adjacency_it, adjacency_end) = boost::adjacent_vertices(u, query); + for (; adjacency_it != adjacency_end; ++adjacency_it) { + if (visited.find(query[*adjacency_it].node_id) == visited.end() && + snte.find(boost::edge(*adjacency_it, u, query).first) != snte.end()) { + if (unvisited_neighbours.find(u) != unvisited_neighbours.end()) { + unvisited_neighbours.at(u).insert(*adjacency_it); + } else { + std::set value = {*adjacency_it}; + unvisited_neighbours.emplace(u, value); + } + } else if (visited.find(*adjacency_it) != visited.end()) { + for (vertex_t const& v : candidates.at(*adjacency_it)) { + typename boost::graph_traits::adjacency_iterator g_adj_it, g_adj_end; + boost::tie(g_adj_it, g_adj_end) = boost::adjacent_vertices(v, graph); + for (; g_adj_it != g_adj_end; ++g_adj_it) { + if (graph[*g_adj_it].attributes.at("label") == + query[u].attributes.at("label") && + boost::degree(*g_adj_it, graph) >= boost::degree(u, query)) { + if (cnts.find(*g_adj_it) == cnts.end()) { + if (cnt == 0) { + cnts.emplace(*g_adj_it, 1); + } + } else { + if (cnts.at(*g_adj_it) == cnt) { + cnts[*g_adj_it]++; + } + } + } + } + } + cnt++; + } + } + typename boost::graph_traits::vertex_iterator g_it, g_end; + for (boost::tie(g_it, g_end) = vertices(graph); g_it != g_end; ++g_it) { + if (((cnts.find(*g_it) == cnts.end()) && (cnt == 0)) || + ((cnts.find(*g_it) != cnts.end()) && (cnts.at(*g_it) == cnt))) { + if (CandVerify(graph, *g_it, query, u)) { + candidates.at(u).insert(*g_it); + } + } + } + visited.insert(u); + cnts.clear(); + } +} + +void ReverseConstruction(std::set const& lev, graph_t const& graph, graph_t const& query, + std::map>& candidates, + std::map& cnts, + std::map>& unvisited_neighbours) { + for (std::set::iterator j = --lev.end(); j != std::next(lev.begin(), -1); --j) { + vertex_t u = *j; + 
int cnt = 0; + if (unvisited_neighbours.find(u) != unvisited_neighbours.end()) { + for (vertex_t const& un : unvisited_neighbours.at(u)) { + for (vertex_t const& v : candidates.at(un)) { + typename boost::graph_traits::adjacency_iterator g_adj_it, g_adj_end; + boost::tie(g_adj_it, g_adj_end) = + boost::adjacent_vertices(boost::vertex(v, graph), graph); + for (; g_adj_it != g_adj_end; ++g_adj_it) { + if (graph[*g_adj_it].attributes.at("label") == + query[u].attributes.at("label") && + boost::degree(*g_adj_it, graph) >= boost::degree(u, query)) { + if (cnts.find(*g_adj_it) == cnts.end()) { + if (cnt == 0) { + cnts.emplace(*g_adj_it, 1); + } + } else { + if (cnts.at(*g_adj_it) == cnt) { + cnts[*g_adj_it]++; + } + } + } + } + } + cnt++; + } + } + + std::set to_delete = {}; + for (vertex_t const& v : candidates.at(u)) { + if (!(((cnts.find(v) == cnts.end()) && (cnt == 0)) || + ((cnts.find(v) != cnts.end()) && (cnts.at(v) == cnt)))) { + to_delete.insert(v); + } + } + for (vertex_t const& d : to_delete) { + candidates.at(u).erase(d); + } + cnts.clear(); + } +} + +void FinalConstruction(std::set const& lev, CPI& cpi, graph_t const& graph, + graph_t const& query, std::map const& parent, + std::map>& candidates) { + for (vertex_t const& u : lev) { + vertex_t up = parent.at(u); + for (vertex_t const& vp : candidates.at(up)) { + typename boost::graph_traits::adjacency_iterator g_adj_it, g_adj_end; + boost::tie(g_adj_it, g_adj_end) = boost::adjacent_vertices(vp, graph); + for (; g_adj_it != g_adj_end; ++g_adj_it) { + if (graph[*g_adj_it].attributes.at("label") == query[u].attributes.at("label") && + boost::degree(*g_adj_it, graph) >= boost::degree(u, query) && + candidates.at(u).find(*g_adj_it) != candidates.at(u).end()) { + std::pair cpi_edge(up, u); + if (cpi.find(cpi_edge) != cpi.end()) { + if (cpi.at(cpi_edge).find(vp) != cpi.at(cpi_edge).end()) { + cpi.at(cpi_edge).at(vp).insert(*g_adj_it); + } else { + std::set value = {*g_adj_it}; + cpi.at(cpi_edge).emplace(vp, value); + 
} + } else { + std::map> edge_map; + std::set value = {*g_adj_it}; + edge_map.emplace(vp, value); + cpi.emplace(cpi_edge, edge_map); + } + } + } + } + } +} + +void TopDownConstruct(CPI& cpi, graph_t const& graph, graph_t const& query, + std::vector> const& levels, + std::map const& parent, + std::map>& candidates, + std::set const& snte) { + vertex_t root = *levels.at(0).begin(); + typename boost::graph_traits::vertex_iterator it, end; + for (boost::tie(it, end) = vertices(query); it != end; ++it) { + std::set empty = {}; + candidates.emplace(*it, empty); + } + + for (boost::tie(it, end) = vertices(graph); it != end; ++it) { + if (graph[*it].attributes.at("label") == query[root].attributes.at("label") && + boost::degree(*it, graph) >= boost::degree(root, query) && + CandVerify(graph, *it, query, root)) { + candidates.at(root).insert(*it); + } + } + std::set visited = {root}; + std::map> unvisited_neighbours; + std::map cnts; + + std::vector>::const_iterator i = std::next(levels.cbegin()); + for (; i != levels.cend(); ++i) { + std::set lev = *i; + DirectConstruction(lev, graph, query, candidates, cnts, unvisited_neighbours, snte, + visited); + ReverseConstruction(lev, graph, query, candidates, cnts, unvisited_neighbours); + FinalConstruction(lev, cpi, graph, query, parent, candidates); + } +} + +void InitialRefinement(vertex_t const& u, graph_t const& graph, graph_t const& query, + std::map const& parent, + std::map>& candidates, + std::map& cnts, int& cnt) { + typename boost::graph_traits::adjacency_iterator q_adj_it, q_adj_end; + boost::tie(q_adj_it, q_adj_end) = boost::adjacent_vertices(u, query); + for (; q_adj_it != q_adj_end; ++q_adj_it) { + if ((parent.find(*q_adj_it) != parent.end()) && (parent.at(*q_adj_it) == u)) { + for (vertex_t const& v : candidates.at(*q_adj_it)) { + typename boost::graph_traits::adjacency_iterator g_adj_it, g_adj_end; + boost::tie(g_adj_it, g_adj_end) = boost::adjacent_vertices(v, graph); + for (; g_adj_it != g_adj_end; ++g_adj_it) { 
+ if (graph[*g_adj_it].attributes.at("label") == + query[u].attributes.at("label") && + boost::degree(*g_adj_it, graph) >= boost::degree(u, query)) { + if (cnts.find(*g_adj_it) == cnts.end()) { + if (cnt == 0) { + cnts.emplace(*g_adj_it, 1); + } + } else { + if (cnts.at(*g_adj_it) == cnt) { + cnts[*g_adj_it]++; + } + } + } + } + } + cnt++; + } + } +} + +void OddDeletion(vertex_t const& u, CPI& cpi, std::map>& candidates, + std::map& cnts, int& cnt) { + std::set to_delete = {}; + for (vertex_t const& v : candidates.at(u)) { + if (!(((cnts.find(v) == cnts.end()) && (cnt == 0)) || + ((cnts.find(v) != cnts.end()) && (cnts.at(v) == cnt)))) { + to_delete.insert(v); + } + } + for (vertex_t const& d : to_delete) { + candidates.at(u).erase(d); + for (auto const& e : cpi) { + if (e.second.find(d) != e.second.end()) { + cpi.at(e.first).erase(d); + } + } + } + cnts.clear(); +} + +void FinalRefinement(vertex_t const& u, CPI& cpi, graph_t const& query, + std::map const& parent, + std::map>& candidates) { + for (vertex_t const& v : candidates.at(u)) { + typename boost::graph_traits::adjacency_iterator q_adj_it, q_adj_end; + boost::tie(q_adj_it, q_adj_end) = boost::adjacent_vertices(u, query); + for (; q_adj_it != q_adj_end; ++q_adj_it) { + vertex_t u_ = *q_adj_it; + if ((parent.find(u_) != parent.end()) && (parent.at(u_) == u)) { + std::pair cpi_edge(u, u_); + for (vertex_t const& v_ : cpi.at(cpi_edge).at(v)) { + if (candidates.at(u_).find(v_) == candidates.at(u_).end()) { + cpi.at(cpi_edge).at(v).erase(v_); + } + } + } + } + } +} + +void BottomUpRefinement(CPI& cpi, graph_t const& graph, graph_t const& query, + std::vector> const& levels, + std::map const& parent, + std::map>& candidates) { + std::map cnts; + + std::vector>::const_iterator lev_it; + for (lev_it = --levels.cend(); lev_it != std::next(levels.begin(), -1); --lev_it) { + for (vertex_t const& u : *lev_it) { + int cnt = 0; + InitialRefinement(u, graph, query, parent, candidates, cnts, cnt); + OddDeletion(u, cpi, 
candidates, cnts, cnt); + FinalRefinement(u, cpi, query, parent, candidates); + } + } +} + +int NumOfEmbeddings(const CPI& cpi, std::vector const& path, vertex_t const& origin) { + std::map, int> result; + std::pair edge(*(std::next(path.end(), -2)), *(std::next(path.end(), -1))); + for (auto& vert_cans : cpi.at(edge)) { + for (vertex_t const& can : vert_cans.second) { + std::pair key(*(std::next(path.end(), -1)), can); + result.emplace(key, 1); + } + } + + for (int i = path.size() - 1; path.at(i) != origin; --i) { + std::map, int> new_result; + std::pair cur_edge(path.at(i - 1), path.at(i)); + for (auto& vert_cans : cpi.at(cur_edge)) { + for (vertex_t const& can : vert_cans.second) { + std::pair key(path.at(i - 1), vert_cans.first); + std::pair counted(path.at(i), can); + if (new_result.find(key) != new_result.end()) { + new_result[key] += result.at(counted); + } else { + new_result.emplace(key, result.at(counted)); + } + } + } + result = new_result; + } + int answer = 0; + for (auto& values : result) { + answer += values.second; + } + return answer; +} + +int CandidatesCardinality(const CPI& cpi, vertex_t const& u) { + for (auto& edge_cans : cpi) { + auto edge = edge_cans.first; + if (edge.first == u) { + return edge_cans.second.size(); + } + } + return 1; +} + +void BuildOptimalSeq(const CPI& cpi, std::vector> const& paths_origin, + std::vector const& NTs, + std::vector> const& paths, std::vector& pi) { + auto cmp = [&cpi, &paths_origin, &NTs](std::vector const& a, + std::vector const& b) { + int nta = NTs.at(std::find(paths_origin.begin(), paths_origin.end(), a) - + paths_origin.begin()); + int ntb = NTs.at(std::find(paths_origin.begin(), paths_origin.end(), b) - + paths_origin.begin()); + if (nta == 0) { + return false; + } + if (ntb == 0) { + return true; + } + return NumOfEmbeddings(cpi, a, *a.begin()) / nta < + NumOfEmbeddings(cpi, b, *b.begin()) / ntb; + }; + pi = *std::min_element(paths.begin(), paths.end(), cmp); +} + +void BuildAccurateOptimalSeq(const 
CPI& cpi, std::vector> const& paths_origin, + std::vector const& origins, + std::vector> const& paths, + std::vector& pi) { + auto cmp = [&cpi, &paths_origin, &origins](std::vector const& a, + std::vector const& b) { + int a_origin = origins.at(std::find(paths_origin.begin(), paths_origin.end(), a) - + paths_origin.begin()); + int b_origin = origins.at(std::find(paths_origin.begin(), paths_origin.end(), b) - + paths_origin.begin()); + return NumOfEmbeddings(cpi, a, a_origin) / CandidatesCardinality(cpi, a_origin) < + NumOfEmbeddings(cpi, b, b_origin) / CandidatesCardinality(cpi, b_origin); + }; + pi = *std::min_element(paths.begin(), paths.end(), cmp); +} + +std::vector MatchingOrder(const CPI& cpi, + std::vector> const& paths_origin, + std::vector const& NTs) { + std::vector> paths; + std::copy(paths_origin.begin(), paths_origin.end(), std::back_inserter(paths)); + + std::vector pi; + BuildOptimalSeq(cpi, paths_origin, NTs, paths, pi); + std::vector origins; + for (auto& path : paths_origin) { + if (path == pi) { + origins.push_back(*path.begin()); + continue; + } + std::vector::iterator pi_it = pi.begin(); + std::vector::const_iterator path_it = path.cbegin(); + for (; *pi_it == *path_it; ++pi_it, ++path_it) { + } + origins.push_back(*(--pi_it)); + } + paths.erase(std::remove(paths.begin(), paths.end(), pi), paths.end()); + + std::vector seq; + std::copy(pi.begin(), pi.end(), std::back_inserter(seq)); + while (!paths.empty()) { + std::vector pi_new; + BuildAccurateOptimalSeq(cpi, paths_origin, origins, paths, pi_new); + std::vector::iterator pi_it = pi_new.begin(); + std::vector::iterator seq_it = seq.begin(); + for (; *pi_it == *seq_it; ++pi_it, ++seq_it) { + } + seq.insert(seq_it, pi_it, pi_new.end()); + paths.erase(std::remove(paths.begin(), paths.end(), pi_new), paths.end()); + } + return seq; +} + +bool ValidateNt(graph_t const& graph, vertex_t const& v, graph_t const& query, vertex_t const& u, + std::vector const& seq, std::map const& parent, + Match match) 
{ + int index = std::find(seq.begin(), seq.end(), u) - seq.begin(); + for (int i = 0; i < index; ++i) { + if ((seq.at(i) != parent.at(u)) && boost::edge(seq.at(i), u, query).second) { + if (!boost::edge(*match.at(i).first, v, graph).second) { + return false; + } + } + } + return true; +} + +std::vector> GetPaths(std::set const& indices, + std::map const& parent_) { + std::vector> result = {}; + std::map parent(parent_); + std::set to_delete = {}; + for (auto& link : parent) { + if (indices.find(link.first) == indices.end()) { + to_delete.insert(link.first); + } + } + for (auto& index : to_delete) { + parent.erase(index); + } + std::set keys = {}; + std::set values = {}; + for (auto& kv : parent) { + keys.insert(kv.first); + values.insert(kv.second); + } + std::set leaves = {}; + std::set_difference(keys.begin(), keys.end(), values.begin(), values.end(), + std::inserter(leaves, leaves.begin())); + for (vertex_t const& leaf : leaves) { + std::vector path = {leaf}; + vertex_t cur = leaf; + while ((parent.find(cur) != parent.end()) && + (indices.find(parent.at(cur)) != indices.end())) { + cur = parent.at(cur); + path.push_back(cur); + } + std::reverse(path.begin(), path.end()); + result.push_back(path); + } + return result; +} + +bool Visited(Match const& match, vertex_t const& v, std::size_t const& index) { + for (std::size_t i = 0; i < match.size(); ++i) { + if (i != index) { + if ((match.at(i).first != match.at(i).second) && (*match.at(i).first == v)) { + return true; + } + } + } + return false; +} + +bool Satisfied(graph_t const& graph, graph_t const& query, std::vector const& seq, + Match const& match, std::vector const& literals) { + for (Literal const& l : literals) { + auto fst_token = l.first; + auto snd_token = l.second; + std::string fst; + std::string snd; + if (fst_token.first == -1) { + fst = fst_token.second; + } else { + vertex_t v; + vertex_t u = boost::vertex(fst_token.first, query); + int index = std::find(seq.begin(), seq.end(), u) - seq.begin(); + 
v = *match.at(index).first; + auto attrs = graph[v].attributes; + if (attrs.find(fst_token.second) == attrs.end()) { + return false; + } + fst = attrs.at(fst_token.second); + } + if (snd_token.first == -1) { + snd = snd_token.second; + } else { + vertex_t v; + vertex_t u = boost::vertex(fst_token.first, query); + int index = std::find(seq.begin(), seq.end(), u) - seq.begin(); + v = *match.at(index).first; + auto attrs = graph[v].attributes; + if (attrs.find(snd_token.second) == attrs.end()) { + return false; + } + fst = attrs.at(snd_token.second); + } + if (fst != snd) { + return false; + } + } + return true; +} + +void FullNTs(std::vector> const& paths, std::set const& nte, + graph_t const& query, std::vector& NTs) { + for (auto& path : paths) { + int nt = 0; + for (auto& desc : nte) { + vertex_t source = boost::source(desc, query); + vertex_t target = boost::target(desc, query); + if ((std::find(path.begin(), path.end(), source) != path.end()) || + (std::find(path.begin(), path.end(), target) != path.end())) { + nt++; + } + } + NTs.push_back(nt); + } +} + +void CompleteSeq(CPI& cpi, std::vector> const& forest, + std::map const& parent, graph_t const& query, + std::set const& nte, std::vector& seq) { + for (auto& tree : forest) { + std::vector> tree_paths = GetPaths(tree, parent); + + std::vector tree_NTs = {}; + for (auto& path : tree_paths) { + int nt = 0; + for (auto& desc : nte) { + vertex_t source = boost::source(desc, query); + vertex_t target = boost::target(desc, query); + if ((std::find(path.begin(), path.end(), source) != path.end()) || + (std::find(path.begin(), path.end(), target) != path.end())) { + nt++; + } + } + tree_NTs.push_back(nt); + } + std::vector tree_seq = MatchingOrder(cpi, tree_paths, tree_NTs); + + seq.insert(seq.end(), ++tree_seq.begin(), tree_seq.end()); + } +} + +bool FullMatch(CPI& cpi, Match& match, std::set const& root_candidates, + std::set const& core, std::vector const& seq, + std::map const& parent, graph_t const& graph, + 
graph_t const& query) { + match.push_back({root_candidates.begin(), root_candidates.end()}); + for (std::size_t i = 1; i < core.size(); ++i) { + std::pair edge(parent.at(seq.at(i)), seq.at(i)); + int index = std::find(seq.begin(), seq.end(), parent.at(seq.at(i))) - seq.begin(); + std::pair::iterator, std::set::iterator> its( + cpi.at(edge).at(*match.at(index).first).begin(), + cpi.at(edge).at(*match.at(index).first).end()); + match.push_back(its); + + while ((match.at(i).first != match.at(i).second) && + (Visited(match, *match.at(i).first, i) || + !ValidateNt(graph, *match.at(i).first, query, seq.at(i), seq, parent, match))) { + match.at(i).first++; + } + if (match.at(i).first == match.at(i).second) { + std::cout << "Trivially satisfied" << std::endl; + return true; + } + } + for (std::size_t i = core.size(); i < seq.size(); ++i) { + match.push_back({root_candidates.end(), root_candidates.end()}); + } + return false; +} + +void IncrementMatch(int& i, const CPI& cpi, Match& match, + std::map const& parent, std::set const& core, + std::vector const& seq, graph_t const& graph, graph_t const& query) { + while ((i != static_cast(core.size())) && (i != -1)) { + if (match.at(i).first == match.at(i).second) { + std::pair edge(parent.at(seq.at(i)), seq.at(i)); + std::size_t index = + std::find(seq.begin(), seq.end(), parent.at(seq.at(i))) - seq.begin(); + std::pair::iterator, std::set::iterator> its( + cpi.at(edge).at(*match.at(index).first).begin(), + cpi.at(edge).at(*match.at(index).first).end()); + match[i] = its; + } else { + match.at(i).first++; + } + + while ((match.at(i).first != match.at(i).second) && + (Visited(match, *match.at(i).first, i) || + !ValidateNt(graph, *match.at(i).first, query, seq.at(i), seq, parent, match))) { + match.at(i).first++; + }; + + if (match.at(i).first == match.at(i).second) { + i--; + continue; + } + i++; + } +} + +bool CheckTrivially(const CPI& cpi, Match& match, std::map const& parent, + std::set const& core, std::vector const& seq) { + 
for (std::size_t k = core.size(); k < seq.size(); ++k) { + std::pair edge(parent.at(seq.at(k)), seq.at(k)); + std::size_t index = std::find(seq.begin(), seq.end(), parent.at(seq.at(k))) - seq.begin(); + std::pair::iterator, std::set::iterator> its( + cpi.at(edge).at(*match.at(index).first).begin(), + cpi.at(edge).at(*match.at(index).first).end()); + match[k] = its; + + while ((match.at(k).first != match.at(k).second) && Visited(match, *match.at(k).first, k)) { + match.at(k).first++; + } + if (match.at(k).first == match.at(k).second) { + std::cout << "Trivially satisfied" << std::endl; + return true; + } + } + return false; +} + +bool CheckMatch(const CPI& cpi, Match& match, std::map const& parent, + std::set const& core, std::vector const& seq, + graph_t const& graph, graph_t const& query, Gfd const& gfd, int& amount) { + while (true) { + std::size_t j = seq.size() - 1; + while ((j != seq.size()) && (j != core.size() - 1)) { + if (match.at(j).first == match.at(j).second) { + std::pair edge(parent.at(seq.at(j)), seq.at(j)); + std::size_t index = + std::find(seq.begin(), seq.end(), parent.at(seq.at(j))) - seq.begin(); + std::pair::iterator, std::set::iterator> its( + cpi.at(edge).at(*match.at(index).first).begin(), + cpi.at(edge).at(*match.at(index).first).end()); + match[j] = its; + } else { + match.at(j).first++; + } + + while ((match.at(j).first != match.at(j).second) && + Visited(match, *match.at(j).first, j)) { + match.at(j).first++; + }; + if (match.at(j).first == match.at(j).second) { + j--; + continue; + } + j++; + } + if (j == core.size() - 1) { + break; + } + + amount++; + // check + if (!Satisfied(graph, query, seq, match, gfd.GetPremises())) { + continue; + } + if (!Satisfied(graph, query, seq, match, gfd.GetConclusion())) { + std::cout << "Checked embeddings: " << amount << std::endl; + return false; + } + } + return true; +} + +bool Check(CPI& cpi, graph_t const& graph, Gfd const& gfd, std::set const& core, + std::vector> const& forest, + std::map 
const& parent, std::set const& nte) { + graph_t query = gfd.GetPattern(); + std::vector> paths = GetPaths(core, parent); + + std::vector NTs = {}; + FullNTs(paths, nte, query, NTs); + std::vector seq = MatchingOrder(cpi, paths, NTs); + + CompleteSeq(cpi, forest, parent, query, nte, seq); + + std::set root_candidates = {}; + std::pair edge(*seq.begin(), *(++seq.begin())); + for (auto& vertices : cpi.at(edge)) { + root_candidates.insert(vertices.first); + } + + std::vector::iterator, std::set::iterator>> match = {}; + + if (FullMatch(cpi, match, root_candidates, core, seq, parent, graph, query)) { + return true; + } + int amount = 1; + // check + if (Satisfied(graph, query, seq, match, gfd.GetPremises()) && + !Satisfied(graph, query, seq, match, gfd.GetConclusion())) { + std::cout << "Checked embeddings: " << amount << std::endl; + return false; + } + + while (true) { + int i = static_cast(core.size()) - 1; + IncrementMatch(i, cpi, match, parent, core, seq, graph, query); + if (i == -1) { + break; + } + + if (forest.empty()) { + amount++; + // check + if (!Satisfied(graph, query, seq, match, gfd.GetPremises())) { + continue; + } + if (!Satisfied(graph, query, seq, match, gfd.GetConclusion())) { + std::cout << "Checked embeddings: " << amount << std::endl; + return false; + } + continue; + } + + if (CheckTrivially(cpi, match, parent, core, seq)) { + return true; + } + + if (!CheckMatch(cpi, match, parent, core, seq, graph, query, gfd, amount)) { + return false; + } + } + std::cout << "total number of embeddings: " << amount << std::endl; + return true; +} + +bool Validate(graph_t const& graph, Gfd const& gfd) { + auto start_time = std::chrono::system_clock::now(); + + graph_t pat = gfd.GetPattern(); + std::set graph_labels = {}; + std::set pat_labels = {}; + typename boost::graph_traits::vertex_iterator it, end; + for (boost::tie(it, end) = vertices(graph); it != end; ++it) { + graph_labels.insert(graph[*it].attributes.at("label")); + } + for (boost::tie(it, end) = 
vertices(pat); it != end; ++it) { + pat_labels.insert(pat[*it].attributes.at("label")); + } + for (auto const& label : pat_labels) { + if (graph_labels.find(label) == graph_labels.end()) { + return true; + } + } + + std::set core = {}; + std::vector> forest = {}; + CfDecompose(pat, core, forest); + + int root = GetRoot(graph, pat, core); + std::vector> levels = {}; + std::map parent; + std::set snte = {}; + std::set nte = {}; + BfsTree(pat, root, levels, parent, nte, snte); + + std::map> candidates; + CPI cpi; + TopDownConstruct(cpi, graph, pat, levels, parent, candidates, snte); + BottomUpRefinement(cpi, graph, pat, levels, parent, candidates); + auto elapsed_milliseconds = std::chrono::duration_cast( + std::chrono::system_clock::now() - start_time); + std::cout << "CPI constructed in " << elapsed_milliseconds.count() << ". Matching..." + << std::endl; + return Check(cpi, graph, gfd, core, forest, parent, nte); +} + +} // namespace + +namespace algos { + +std::vector EGfdValidation::GenerateSatisfiedGfds(graph_t const& graph, + std::vector const& gfds) { + for (auto& gfd : gfds) { + if (Validate(graph, gfd)) { + result_.push_back(gfd); + } + } + return result_; +} + +} // namespace algos diff --git a/src/core/algorithms/gfd/egfd_validation.h b/src/core/algorithms/gfd/egfd_validation.h new file mode 100644 index 0000000000..1ca3656c18 --- /dev/null +++ b/src/core/algorithms/gfd/egfd_validation.h @@ -0,0 +1,19 @@ +#pragma once +#include "algorithms/gfd/gfd_handler.h" +#include "config/names_and_descriptions.h" +#include "gfd.h" + +namespace algos { + +using CPI = std::map, std::map>>; + +class EGfdValidation : public GfdHandler { +public: + std::vector GenerateSatisfiedGfds(graph_t const& graph, std::vector const& gfds); + + EGfdValidation() : GfdHandler(){}; + + EGfdValidation(graph_t graph_, std::vector gfds_) : GfdHandler(graph_, gfds_) {} +}; + +} // namespace algos From 07019c623d11bb59db80a604ada8b2b7eff810e8 Mon Sep 17 00:00:00 2001 From: AntonChern Date: 
Mon, 4 Dec 2023 17:49:25 +0300 Subject: [PATCH 5/6] Implement naive GFD validation algorithm GFD validation algorithm that uses VF2 algorithm to find a subgraph. --- src/core/algorithms/algorithm_types.h | 12 +- src/core/algorithms/algorithms.h | 1 + .../algorithms/gfd/naivegfd_validation.cpp | 132 ++++++++++++++++++ src/core/algorithms/gfd/naivegfd_validation.h | 19 +++ 4 files changed, 159 insertions(+), 5 deletions(-) create mode 100644 src/core/algorithms/gfd/naivegfd_validation.cpp create mode 100644 src/core/algorithms/gfd/naivegfd_validation.h diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h index 313f169622..36500400ed 100644 --- a/src/core/algorithms/algorithm_types.h +++ b/src/core/algorithms/algorithm_types.h @@ -6,10 +6,11 @@ namespace algos { -using AlgorithmTypes = std::tuple; +using AlgorithmTypes = + std::tuple; // clang-format off /* Enumeration of all supported non-pipeline algorithms. If you implement a new @@ -61,7 +62,8 @@ BETTER_ENUM(AlgorithmType, char, /* Graph functional dependency mining algorithms */ gfdvalid, - egfdvalid + egfdvalid, + naivegfdvalid ) // clang-format on diff --git a/src/core/algorithms/algorithms.h b/src/core/algorithms/algorithms.h index 4db60e0c6b..61d68bd51d 100644 --- a/src/core/algorithms/algorithms.h +++ b/src/core/algorithms/algorithms.h @@ -41,3 +41,4 @@ /* Graph functional dependency mining algorithms */ #include "algorithms/gfd/egfd_validation.h" #include "algorithms/gfd/gfd_validation.h" +#include "algorithms/gfd/naivegfd_validation.h" diff --git a/src/core/algorithms/gfd/naivegfd_validation.cpp b/src/core/algorithms/gfd/naivegfd_validation.cpp new file mode 100644 index 0000000000..d00ff47b75 --- /dev/null +++ b/src/core/algorithms/gfd/naivegfd_validation.cpp @@ -0,0 +1,132 @@ +#include "naivegfd_validation.h" + +#include +#include + +#include + +#include "gfd.h" + +namespace { + +struct CheckCallback { +private: + graph_t const& query; + graph_t const& graph; + const 
std::vector premises; + const std::vector conclusion; + bool& res; + int& amount; + +public: + CheckCallback(graph_t const& query_, graph_t const& graph_, + std::vector const& premises_, std::vector const& conclusion_, + bool& res_, int& amount_) + : query(query_), + graph(graph_), + premises(premises_), + conclusion(conclusion_), + res(res_), + amount(amount_) {} + + template + bool operator()(CorrespondenceMap1To2 f, CorrespondenceMap2To1) const { /* invoked with one pattern-to-graph vertex mapping f; counts it in amount, sets res to false and returns false on the first mapping that satisfies the premises but not the conclusion, returns true otherwise */ + amount++; + auto satisfied = [this, &f](std::vector const& literals) { /* true iff both sides of every literal resolve to equal strings under f */ + for (const Literal& l : literals) { + auto fst_token = l.first; + auto snd_token = l.second; + std::string fst; + std::string snd; + if (fst_token.first == -1) { /* index -1: the token is a constant whose value is stored in .second */ + fst = fst_token.second; + } else { + vertex_t v; + vertex_t u = boost::vertex(fst_token.first, query); + v = get(f, u); + auto const& attrs = graph[v].attributes; /* bind by reference: no copy of the attribute map per literal */ + if (attrs.find(fst_token.second) == attrs.end()) { + return false; + } + fst = attrs.at(fst_token.second); + } + if (snd_token.first == -1) { + snd = snd_token.second; + } else { + vertex_t v; + vertex_t u = boost::vertex(snd_token.first, query); /* resolve the second token's own pattern vertex (previously used fst_token.first) */ + v = get(f, u); + auto const& attrs = graph[v].attributes; + if (attrs.find(snd_token.second) == attrs.end()) { + return false; + } + snd = attrs.at(snd_token.second); /* store into snd (previously overwrote fst and left snd empty) */ + } + if (fst != snd) { + return false; + } + } + return true; + }; + + if (!satisfied(premises)) { + return true; + } + if (!satisfied(conclusion)) { + res = false; + return false; + } + return true; + } +}; + +bool Validate(graph_t const& graph, Gfd const& gfd) { + graph_t pattern = gfd.GetPattern(); + + struct VCompare { + graph_t const& pattern; + graph_t const& graph; + + bool operator()(vertex_t fr, vertex_t to) const { + return pattern[fr].attributes.at("label") == graph[to].attributes.at("label"); + } + } vcompare{pattern, graph}; + + struct ECompare { + graph_t const& pattern; + graph_t const& graph; + + bool operator()(edge_t fr, edge_t to) const { + return pattern[fr].label == graph[to].label; + } + } ecompare{pattern, 
graph}; + + bool res = true; /* handed to the callback by reference; the callback sets it to false when a mapping satisfies the premises but not the conclusion */ + int amount = 0; /* number of mappings the callback was invoked with */ + CheckCallback callback(pattern, graph, gfd.GetPremises(), gfd.GetConclusion(), res, amount); + + bool found = boost::vf2_subgraph_iso( + pattern, graph, callback, get(boost::vertex_index, pattern), + get(boost::vertex_index, graph), vertex_order_by_mult(pattern), ecompare, vcompare); + std::cout << "Checked embeddings: " << amount << std::endl; + if (!found) { /* vf2_subgraph_iso reported no match, so the GFD is reported as satisfied */ + return true; + } + return res; +} + +} // namespace + +namespace algos { + +std::vector NaiveGfdValidation::GenerateSatisfiedGfds(graph_t const& graph, + std::vector const& gfds) { /* appends every GFD accepted by Validate() to result_ and returns result_ by value */ + for (auto& gfd : gfds) { + if (Validate(graph, gfd)) { + result_.push_back(gfd); + } + } + return result_; +} + +} // namespace algos diff --git a/src/core/algorithms/gfd/naivegfd_validation.h b/src/core/algorithms/gfd/naivegfd_validation.h new file mode 100644 index 0000000000..e43af06b61 --- /dev/null +++ b/src/core/algorithms/gfd/naivegfd_validation.h @@ -0,0 +1,19 @@ +#pragma once +#include + +#include "algorithms/algorithm.h" +#include "algorithms/gfd/gfd_handler.h" +#include "gfd.h" + +namespace algos { + +class NaiveGfdValidation : public GfdHandler { /* GfdHandler implementation whose GenerateSatisfiedGfds validates each GFD with boost::vf2_subgraph_iso */ +public: + std::vector GenerateSatisfiedGfds(graph_t const& graph, std::vector const& gfds); + + NaiveGfdValidation() : GfdHandler(){}; + + NaiveGfdValidation(graph_t graph_, std::vector gfds_) : GfdHandler(graph_, gfds_) {} +}; + +} // namespace algos From 3810ba0511a2bb886678653b49fe8e95152db64f Mon Sep 17 00:00:00 2001 From: AntonChern Date: Mon, 4 Dec 2023 17:52:09 +0300 Subject: [PATCH 6/6] Add tests Input data is presented as dot-files. Undirected graphs are used. 
--- src/tests/test_gfd_validation.cpp | 63 +++++++++++++++++++ test_input_data/graph_data/directors.dot | 23 +++++++ test_input_data/graph_data/directors_gfd.dot | 7 +++ test_input_data/graph_data/quadrangle.dot | 14 +++++ test_input_data/graph_data/quadrangle_gfd.dot | 7 +++ 5 files changed, 114 insertions(+) create mode 100644 src/tests/test_gfd_validation.cpp create mode 100644 test_input_data/graph_data/directors.dot create mode 100644 test_input_data/graph_data/directors_gfd.dot create mode 100644 test_input_data/graph_data/quadrangle.dot create mode 100644 test_input_data/graph_data/quadrangle_gfd.dot diff --git a/src/tests/test_gfd_validation.cpp b/src/tests/test_gfd_validation.cpp new file mode 100644 index 0000000000..4af54fa16a --- /dev/null +++ b/src/tests/test_gfd_validation.cpp @@ -0,0 +1,63 @@ +#include +#include + +#include "algorithms/algo_factory.h" +#include "algorithms/gfd/gfd_validation.h" +#include "config/names.h" +#include "table_config.h" + +using namespace algos; +using algos::StdParamsMap; + +namespace tests { + +namespace { + +auto current_path = test_data_dir / "graph_data"; + +template +class GfdValidationTest : public ::testing::Test { /* typed fixture: creates and loads the algorithm under test from a graph file and a list of GFD files */ +protected: + std::unique_ptr CreateGfdValidationInstance( + std::filesystem::path const& graph_path, + std::vector const& gfd_paths) { + StdParamsMap optionMap = {{config::names::kGraphData, graph_path}, + {config::names::kGfdData, gfd_paths}}; + return algos::CreateAndLoadAlgorithm(optionMap); + } +}; + +TYPED_TEST_SUITE_P(GfdValidationTest); + +TYPED_TEST_P(GfdValidationTest, TestTrivially) { /* expects the single GFD from quadrangle_gfd.dot to be returned as satisfied */ + auto graph_path = current_path / "quadrangle.dot"; + auto gfd_path = current_path / "quadrangle_gfd.dot"; + std::vector gfd_paths = {gfd_path}; + auto algorithm = TestFixture::CreateGfdValidationInstance(graph_path, gfd_paths); + std::size_t expected_size = 1; /* same type as GfdList.size(): no signed/unsigned comparison inside ASSERT_EQ */ + algorithm->Execute(); + std::vector GfdList = algorithm->GfdList(); + ASSERT_EQ(expected_size, GfdList.size()); +} + +TYPED_TEST_P(GfdValidationTest, 
TestExistingMatches) { /* expects the GFD from directors_gfd.dot to be rejected, i.e. an empty result list */ + auto graph_path = current_path / "directors.dot"; + auto gfd_path = current_path / "directors_gfd.dot"; + std::vector gfd_paths = {gfd_path}; + auto algorithm = TestFixture::CreateGfdValidationInstance(graph_path, gfd_paths); + std::size_t expected_size = 0; /* same type as GfdList.size(): no signed/unsigned comparison inside ASSERT_EQ */ + algorithm->Execute(); + std::vector GfdList = algorithm->GfdList(); + ASSERT_EQ(expected_size, GfdList.size()); +} + +REGISTER_TYPED_TEST_SUITE_P(GfdValidationTest, TestTrivially, TestExistingMatches); + +using GfdAlgorithms = + ::testing::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(GfdValidationTest, GfdValidationTest, GfdAlgorithms); + +} // namespace + +} // namespace tests diff --git a/test_input_data/graph_data/directors.dot b/test_input_data/graph_data/directors.dot new file mode 100644 index 0000000000..25cdc269d5 --- /dev/null +++ b/test_input_data/graph_data/directors.dot @@ -0,0 +1,23 @@ +graph G { +0[label="person" name="James Cameron" celebrity="high"]; +1[label="film" name="Avatar" success="high" year="2009"]; +2[label="film" name="Titanic" success="high" year="1997"]; +3[label="film" name="Piranha II" success="low" year="1981"]; +4[label="film" name="Terminator" success="high" year="1984"]; +5[label="person" name="Robert Zemeckis" celebrity="high"]; +6[label="film" name="The Walk" success="high" year="2015"]; +7[label="film" name="Back to the future" success="high" year="1985"]; +8[label="film" name="Forrest Gump" success="high" year="1994"]; +9[label="person" name="James Toback" celebrity="low"]; +10[label="film" name="Tyson" success="high" year="2008"]; +11[label="film" name="Fingers" success="high" year="1978"]; +0--1 [label="directed"]; +0--2 [label="directed"]; +0--3 [label="directed"]; +0--4 [label="directed"]; +5--6 [label="directed"]; +5--7 [label="directed"]; +5--8 [label="directed"]; +9--10 [label="directed"]; +9--11 [label="directed"]; +} diff --git a/test_input_data/graph_data/directors_gfd.dot b/test_input_data/graph_data/directors_gfd.dot new file mode 100644 index 
0000000000..97bbcef324 --- /dev/null +++ b/test_input_data/graph_data/directors_gfd.dot @@ -0,0 +1,7 @@ +0.celebrity=high +1.success=high +graph G { +0[label=person]; +1[label=film]; +0--1 [label=directed]; +} diff --git a/test_input_data/graph_data/quadrangle.dot b/test_input_data/graph_data/quadrangle.dot new file mode 100644 index 0000000000..71e9c7a347 --- /dev/null +++ b/test_input_data/graph_data/quadrangle.dot @@ -0,0 +1,14 @@ +graph G { +0[label=quadrilateral angles=arbitrary sides=arbitrary]; +1[label=parallelogram angles=pairwise_equal sides=pairwise_equal]; +2[label=trapezoid angles=arbitrary sides=parallel_and_arbitrary]; +3[label=rectangle angles=equal sides=pairwise_equal]; +4[label=rhombus angles=pairwise_equal sides=equal]; +5[label=square angles=equal sides=equal]; +0--1 [label=two_pairs_of_parallel_sides]; +0--2 [label=one_pair_of_parallel_sides]; +1--3 [label=equality_of_angles]; +1--4 [label=equality_of_sides]; +3--5 [label=equality_of_sides]; +4--5 [label=equality_of_angles]; +} diff --git a/test_input_data/graph_data/quadrangle_gfd.dot b/test_input_data/graph_data/quadrangle_gfd.dot new file mode 100644 index 0000000000..0ad28e783e --- /dev/null +++ b/test_input_data/graph_data/quadrangle_gfd.dot @@ -0,0 +1,7 @@ + +0.sides=1.sides +graph G { +0[label=polygon]; +1[label=triangle]; +0--1 [label=three_sides]; +}