From 0eb7f93119609154a1541a11dd4e130bcdd18acd Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Feb 2022 17:02:41 +0100 Subject: [PATCH 01/15] Introducing PotatoGraph and penman dump #26 --- xpotato/dataset/dataset.py | 116 ++++++++++++++++++++++++----- xpotato/dataset/sample.py | 12 +-- xpotato/graph_extractor/extract.py | 8 +- xpotato/graph_extractor/graph.py | 16 ++++ 4 files changed, 124 insertions(+), 28 deletions(-) create mode 100644 xpotato/graph_extractor/graph.py diff --git a/xpotato/dataset/dataset.py b/xpotato/dataset/dataset.py index 249f65f..aa125e1 100644 --- a/xpotato/dataset/dataset.py +++ b/xpotato/dataset/dataset.py @@ -1,24 +1,87 @@ -import pickle +from re import I from typing import List, Tuple, Dict import networkx as nx import pandas as pd +from tqdm import tqdm +from tuw_nlp.graph.utils import graph_to_pn from xpotato.dataset.sample import Sample from xpotato.graph_extractor.extract import GraphExtractor +from xpotato.graph_extractor.graph import PotatoGraph class Dataset: def __init__( - self, examples: List[Tuple[str, str]], label_vocab: Dict[str, int], lang="en" + self, + examples: List[Tuple[str, str]] = None, + label_vocab: Dict[str, int] = {}, + lang="en", + path=None, + binary=False, ) -> None: self.label_vocab = label_vocab - self._dataset = self.read_dataset(examples) + if path: + self._dataset = self.read_dataset(path=path, binary=binary) + else: + self._dataset = self.read_dataset(examples=examples) self.extractor = GraphExtractor(lang=lang, cache_fn=f"{lang}_nlp_cache") self.graphs = None - def read_dataset(self, examples: List[Tuple[str, str]]) -> List[Sample]: - return [Sample(example) for example in examples] + def read_dataset( + self, + examples: List[Tuple[str, str]] = None, + path: str = None, + binary: bool = False, + ) -> List[Sample]: + if examples: + return [Sample(example) for example in examples] + elif path: + if binary: + df = pd.read_pickle(path) + graphs = [] + for i, graph in enumerate(df["graph"].tolist()): + graph.remove_nodes_from(list(nx.isolates(graph))) + # ADAM: THIS IS JUST FOR PICKLE TO PENMAN CONVERSION + graph = self._random_postprocess(graph) + + g = [ + c + for c in sorted( + nx.weakly_connected_components(graph), key=len, reverse=True + ) + ] + if len(g) > 1: + print( + "WARNING: graph has multiple connected components, taking the largest" + ) + g_pn = graph_to_pn(graph.subgraph(g[0].copy())) + else: + g_pn = graph_to_pn(graph) + + graphs.append(g_pn) + df.drop(columns=["graph"], inplace=True) + df["graph"] = graphs + else: + df = pd.read_csv(path, sep="\t") + + return [ + Sample( + (example["text"], example["label"]), + PotatoGraph(graph_str=example["graph"]), + ) + for _, example in tqdm(df.iterrows()) + ] + else: + raise ValueError("No examples or path provided") + + # ADAM: THIS WILL NEED TO BE ADDRESSED + def _random_postprocess(self, graph: nx.DiGraph) -> nx.DiGraph: + for node, attr in graph.nodes(data=True): + if len(attr["name"].split()) > 1: + attr["name"] = attr["name"].split()[0] + + return graph def to_dataframe(self) -> pd.DataFrame: df = pd.DataFrame( @@ -29,7 +92,7 @@ def to_dataframe(self) -> pd.DataFrame: self.label_vocab[sample.label] if sample.label else None for sample in self._dataset ], - "graph": [sample.graph for sample in self._dataset], + "graph": [sample.potato_graph.graph for sample in self._dataset], } ) return df @@ -41,23 +104,38 @@ def parse_graphs(self, graph_format: str = "fourlang") -> List[nx.DiGraph]: ) ) - self.graphs = graphs - return graphs + self.graphs = [PotatoGraph(graph) 
for graph in graphs] + return self.graphs - def set_graphs(self, graphs: List[nx.DiGraph]) -> None: - for sample, graph in zip(self._dataset, graphs): - graph.remove_edges_from(nx.selfloop_edges(graph)) - sample.set_graph(graph) + def set_graphs(self, graphs: List[PotatoGraph]) -> None: + for sample, potato_graph in zip(self._dataset, graphs): + potato_graph.graph.remove_edges_from(nx.selfloop_edges(potato_graph.graph)) + sample.set_graph(potato_graph) def load_graphs(self, path: str) -> None: - PIK = path - - with open(PIK, "rb") as f: - self.graphs = pickle.load(f) + with open(path, "rb") as f: + for line in f: + graph = PotatoGraph() + graph.from_penman(line.strip()) + self.graphs.append(graph) self.set_graphs(self.graphs) + def save_dataset(self, path: str) -> None: + df = pd.DataFrame( + { + "text": [sample.text for sample in self._dataset], + "label": [sample.label for sample in self._dataset], + "label_id": [ + self.label_vocab[sample.label] if sample.label else None + for sample in self._dataset + ], + "graph": [str(sample.potato_graph) for sample in self._dataset], + } + ) + df.to_csv(path, index=False, sep="\t") + def save_graphs(self, path: str) -> None: - PIK = path - with open(PIK, "wb") as f: - pickle.dump(self.graphs, f) + with open(path, "wb") as f: + for graph in self.graphs: + f.write(str(graph) + "\n") diff --git a/xpotato/dataset/sample.py b/xpotato/dataset/sample.py index a62c171..d517e16 100644 --- a/xpotato/dataset/sample.py +++ b/xpotato/dataset/sample.py @@ -1,13 +1,15 @@ from typing import Tuple -import networkx as nx +from xpotato.graph_extractor.graph import PotatoGraph class Sample: - def __init__(self, example: Tuple[str, str]) -> None: + def __init__( + self, example: Tuple[str, str], potato_graph: PotatoGraph = None + ) -> None: self.text = example[0] self.label = example[1] - self.graph = None + self.potato_graph = potato_graph - def set_graph(self, graph: nx.DiGraph) -> None: - self.graph = graph + def set_graph(self, graph: PotatoGraph) -> None: + self.potato_graph = graph diff --git a/xpotato/graph_extractor/extract.py b/xpotato/graph_extractor/extract.py index e613f99..f4ffb51 100644 --- a/xpotato/graph_extractor/extract.py +++ b/xpotato/graph_extractor/extract.py @@ -43,7 +43,7 @@ def init_nlp(self): def parse_iterable(self, iterable, graph_type="fourlang"): if graph_type == "fourlang": with TextTo4lang( - lang=self.lang, nlp_cache=self.cache_fn, cache_dir=self.cache_dir + lang=self.lang, nlp_cache=self.cache_fn, cache_dir=self.cache_dir ) as tfl: for sen in tqdm(iterable): fl_graphs = list(tfl(sen)) @@ -274,8 +274,8 @@ def select_words(self, trained_features): for word in words_to_measures: if words_to_measures[word]["precision"] > 0.9 and ( - words_to_measures[word]["TP"] > 1 - or words_to_measures[word]["recall"] > 0.01 + words_to_measures[word]["TP"] > 1 + or words_to_measures[word]["recall"] > 0.01 ): selected_words.add(word) @@ -311,7 +311,7 @@ def evaluate_feature(self, cl, features, data, graph_format="ud"): accuracy = [] for pcf in precision_recall_fscore_support( - labels, whole_predicted, average=None + labels, whole_predicted, average=None ): if len(pcf) > 1: accuracy.append(pcf[1]) diff --git a/xpotato/graph_extractor/graph.py b/xpotato/graph_extractor/graph.py new file mode 100644 index 0000000..3613f0d --- /dev/null +++ b/xpotato/graph_extractor/graph.py @@ -0,0 +1,16 @@ +import networkx as nx +from tuw_nlp.graph.utils import graph_to_pn +from xpotato.dataset.utils import default_pn_to_graph + + +class PotatoGraph: + def __init__(self, 
graph: nx.DiGraph = None, graph_str: str = None) -> None: + if graph: + self.graph = graph + elif graph_str: + self.graph, _ = default_pn_to_graph(graph_str) + else: + self.graph = None + + def __str__(self) -> str: + return graph_to_pn(self.graph) From f84680ffd882c59a143790da3d72b2eb95797219 Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Fri, 11 Feb 2022 10:13:18 +0100 Subject: [PATCH 02/15] stricter dependencies #39 --- setup.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 28a2d55..6837559 100644 --- a/setup.py +++ b/setup.py @@ -11,24 +11,20 @@ author_email="adam.kovacs@tuwien.ac.at, gabor.recski@tuwien.ac.at", license="MIT", install_requires=[ - "beautifulsoup4", - "tinydb", - "pandas", + "pandas >= 1.3.5", "tqdm", - "stanza", - "sklearn", - "eli5", - "matplotlib", - "graphviz", - "openpyxl", - "penman", - "networkx >= 2.6.3", - "rank_bm25", + "stanza == 1.3.0", + "scikit-learn == 1.0.2", + "eli5 == 0.11.0", + "graphviz == 0.18.2", + "penman >= 1.2.1", + "networkx == 2.6.3", + "rank_bm25 == 0.2.1", "streamlit == 1.3.1", - "streamlit-aggrid", - "scikit-criteria >= 0.5", - "tuw-nlp", - "amrlib", + "streamlit-aggrid == 0.2.3.post2", + "scikit-criteria == 0.5", + "tuw-nlp >= 0.0.4", + "amrlib == 0.6.0", ], packages=find_packages(), classifiers=[ @@ -39,8 +35,6 @@ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], From b97f2b26fe692c841c0765a055c19e5efc1be5e1 Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Fri, 11 Feb 2022 19:29:32 +0100 Subject: [PATCH 03/15] More on penman format #26 --- frontend/app.py | 9 ++--- frontend/utils.py | 28 ++++++++------ xpotato/dataset/dataset.py | 75 +++++++++++++++++++++++--------------- xpotato/dataset/utils.py | 8 ++++ 4 files changed, 75 insertions(+), 45 deletions(-) diff --git a/frontend/app.py b/frontend/app.py index 4a4d24e..fa442ca 100644 --- a/frontend/app.py +++ b/frontend/app.py @@ -22,8 +22,7 @@ init_extractor, init_session_states, rank_and_suggest, - read_train, - read_val, + read_df, rerun, rule_chooser, save_ruleset, @@ -634,7 +633,7 @@ def simple_mode(evaluator, data, val_data, graph_format, feature_path, hand_made def advanced_mode(evaluator, train_data, graph_format, feature_path, hand_made_rules): - data = read_train(train_data) + data = read_df(train_data) if hand_made_rules: with open(hand_made_rules) as f: st.session_state.features = json.load(f) @@ -1216,9 +1215,9 @@ def main(args): init_session_states() evaluator = init_evaluator() if args.train_data: - data = read_train(args.train_data, args.label) + data = read_df(args.train_data, args.label) if args.val_data: - val_data = read_val(args.val_data, args.label) + val_data = read_df(args.val_data, args.label) graph_format = args.graph_format feature_path = args.suggested_rules hand_made_rules = args.hand_rules diff --git a/frontend/utils.py b/frontend/utils.py index 8b02038..77d36e6 100644 --- a/frontend/utils.py +++ b/frontend/utils.py @@ -13,6 +13,7 @@ from streamlit.report_thread import REPORT_CONTEXT_ATTR_NAME from xpotato.dataset.utils import default_pn_to_graph +from xpotato.graph_extractor.graph import PotatoGraph from xpotato.graph_extractor.extract import FeatureEvaluator, GraphExtractor from xpotato.models.trainer import GraphTrainer 
from xpotato.dataset.utils import default_pn_to_graph @@ -229,23 +230,28 @@ def filter_label(df, label): @st.cache(allow_output_mutation=True) -def read_train(path, label=None): - df = pd.read_pickle(path) +def read_df(path, label=None, binary=False): + if binary: + df = pd.read_pickle(path) + else: + df = pd.read_csv(path, sep="\t") + graphs = [] + for graph in df["graph"]: + potato_graph = PotatoGraph(graph_str=graph) + graphs.append(potato_graph.graph) + df["graph"] = graphs if label is not None: filter_label(df, label) return df def save_dataframe(data, path): - data.to_pickle(path) - - -@st.cache(allow_output_mutation=True) -def read_val(path, label=None): - df = pd.read_pickle(path) - if label is not None: - filter_label(df, label) - return df + if ".pickle" in path: + data.to_pickle(path) + else: + graphs = data["graph"] + data["graph"] = [graph_to_pn(graph) for graph in graphs] + data.to_csv(path, sep="\t", index=False) def train_df(df, min_edge=0, rank=False): diff --git a/xpotato/dataset/dataset.py b/xpotato/dataset/dataset.py index aa125e1..3e1b84f 100644 --- a/xpotato/dataset/dataset.py +++ b/xpotato/dataset/dataset.py @@ -28,6 +28,37 @@ def __init__( self.extractor = GraphExtractor(lang=lang, cache_fn=f"{lang}_nlp_cache") self.graphs = None + @staticmethod + def save_dataframe(df: pd.DataFrame, path: str) -> None: + graphs = [graph_to_pn(graph) for graph in df["graph"].tolist()] + df["graph"] = graphs + df.to_csv(path, index=False, sep="\t") + + def prune_graphs(self, graphs: List[nx.DiGraph] = None) -> None: + graphs_str = [] + for i, graph in enumerate(graphs): + graph.remove_nodes_from(list(nx.isolates(graph))) + # ADAM: THIS IS JUST FOR PICKLE TO PENMAN CONVERSION + graph = self._random_postprocess(graph) + + g = [ + c + for c in sorted( + nx.weakly_connected_components(graph), key=len, reverse=True + ) + ] + if len(g) > 1: + print( + "WARNING: graph has multiple connected components, taking the largest" + ) + g_pn = graph_to_pn(graph.subgraph(g[0].copy())) + else: + g_pn = graph_to_pn(graph) + + graphs_str.append(g_pn) + + return graphs_str + def read_dataset( self, examples: List[Tuple[str, str]] = None, @@ -35,33 +66,13 @@ def read_dataset( binary: bool = False, ) -> List[Sample]: if examples: - return [Sample(example) for example in examples] + return [Sample(example, PotatoGraph()) for example in examples] elif path: if binary: df = pd.read_pickle(path) - graphs = [] - for i, graph in enumerate(df["graph"].tolist()): - graph.remove_nodes_from(list(nx.isolates(graph))) - # ADAM: THIS IS JUST FOR PICKLE TO PENMAN CONVERSION - graph = self._random_postprocess(graph) - - g = [ - c - for c in sorted( - nx.weakly_connected_components(graph), key=len, reverse=True - ) - ] - if len(g) > 1: - print( - "WARNING: graph has multiple connected components, taking the largest" - ) - g_pn = graph_to_pn(graph.subgraph(g[0].copy())) - else: - g_pn = graph_to_pn(graph) - - graphs.append(g_pn) + graphs_str = self.prune_graphs(df.graph.tolist()) df.drop(columns=["graph"], inplace=True) - df["graph"] = graphs + df["graph"] = graphs_str else: df = pd.read_csv(path, sep="\t") @@ -112,12 +123,18 @@ def set_graphs(self, graphs: List[PotatoGraph]) -> None: potato_graph.graph.remove_edges_from(nx.selfloop_edges(potato_graph.graph)) sample.set_graph(potato_graph) - def load_graphs(self, path: str) -> None: - with open(path, "rb") as f: - for line in f: - graph = PotatoGraph() - graph.from_penman(line.strip()) - self.graphs.append(graph) + def load_graphs(self, path: str, binary: bool = False) -> 
None: + if binary: + graphs = [graph for graph in pd.read_pickle(path)] + graph_str = self.prune_graphs(graphs) + + graphs = [PotatoGraph(graph_str=graph) for graph in graph_str] + self.graphs = graphs + else: + with open(path, "rb") as f: + for line in f: + graph = PotatoGraph(graph_str=line.strip()) + self.graphs.append(graph) self.set_graphs(self.graphs) diff --git a/xpotato/dataset/utils.py b/xpotato/dataset/utils.py index f6c5b94..7cbd107 100644 --- a/xpotato/dataset/utils.py +++ b/xpotato/dataset/utils.py @@ -1,8 +1,16 @@ from collections import defaultdict import networkx as nx +import pandas as pd import penman as pn from tuw_nlp.graph.utils import preprocess_node_alto +from tuw_nlp.graph.utils import graph_to_pn + + +def save_dataframe(df: pd.DataFrame, path: str) -> None: + graphs = [graph_to_pn(graph) for graph in df["graph"].tolist()] + df["graph"] = graphs + df.to_csv(path, index=False, sep="\t") def ud_to_graph(sen, edge_attr="color"): From fc39e7d4b3e0d62bc0ae30fc9e115d3376db3cef Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Mon, 14 Feb 2022 14:00:12 +0100 Subject: [PATCH 04/15] More on penman graphs #26 --- scripts/convert_pickle.py | 30 ++++++++++++++++++++++++++++++ scripts/evaluate.py | 15 ++++++++++++--- xpotato/dataset/dataset.py | 9 ++++----- xpotato/dataset/sample.py | 16 ++++++++++++++-- xpotato/dataset/utils.py | 7 ++++--- 5 files changed, 64 insertions(+), 13 deletions(-) create mode 100644 scripts/convert_pickle.py diff --git a/scripts/convert_pickle.py b/scripts/convert_pickle.py new file mode 100644 index 0000000..de6b85c --- /dev/null +++ b/scripts/convert_pickle.py @@ -0,0 +1,30 @@ +import argparse +import json +import logging +import sys + +from xpotato.dataset.dataset import Dataset +from xpotato.dataset.utils import save_dataframe + + +def get_args(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("-p", "--pickle", type=str, required=True) + parser.add_argument("-o", "--output", type=str, required=True) + return parser.parse_args() + + +def main(): + args = get_args() + + path = args.pickle + output = args.output + + dataset = Dataset(path=path, binary=True) + df = dataset.to_dataframe() + + save_dataframe(df, output) + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate.py b/scripts/evaluate.py index d4d83c1..29241f6 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -7,6 +7,7 @@ from sklearn.metrics import classification_report from xpotato.graph_extractor.extract import FeatureEvaluator +from xpotato.graph_extractor.graph import PotatoGraph # TODO Adam: This is not the best place for these functions but I didn't want it to be in the frontend.utils @@ -18,8 +19,16 @@ def filter_label(df, label): df["label_id"] = df.apply(lambda x: 0 if x["label"] == "NOT" else 1, axis=1) -def read_val(path, label=None): - df = pd.read_pickle(path) +def read_df(path, label=None, binary=False): + if binary: + df = pd.read_pickle(path) + else: + df = pd.read_csv(path, sep="\t") + graphs = [] + for graph in df["graph"]: + potato_graph = PotatoGraph(graph_str=graph) + graphs.append(potato_graph.graph) + df["graph"] = graphs if label is not None: filter_label(df, label) return df @@ -52,7 +61,7 @@ def main(): ) args = get_args() - df = read_val(args.dataset_path, args.label) + df = read_df(args.dataset_path, args.label) with open(args.features) as f: features = json.load(f) diff --git a/xpotato/dataset/dataset.py b/xpotato/dataset/dataset.py index 3e1b84f..fd1c103 100644 --- a/xpotato/dataset/dataset.py +++ 
b/xpotato/dataset/dataset.py @@ -79,7 +79,8 @@ def read_dataset( return [ Sample( (example["text"], example["label"]), - PotatoGraph(graph_str=example["graph"]), + potato_graph=PotatoGraph(graph_str=example["graph"]), + label_id=example["label_id"], ) for _, example in tqdm(df.iterrows()) ] @@ -100,8 +101,7 @@ def to_dataframe(self) -> pd.DataFrame: "text": [sample.text for sample in self._dataset], "label": [sample.label for sample in self._dataset], "label_id": [ - self.label_vocab[sample.label] if sample.label else None - for sample in self._dataset + sample.get_label_id(self.label_vocab) for sample in self._dataset ], "graph": [sample.potato_graph.graph for sample in self._dataset], } @@ -144,8 +144,7 @@ def save_dataset(self, path: str) -> None: "text": [sample.text for sample in self._dataset], "label": [sample.label for sample in self._dataset], "label_id": [ - self.label_vocab[sample.label] if sample.label else None - for sample in self._dataset + sample.get_label_id(self.label_vocab) for sample in self._dataset ], "graph": [str(sample.potato_graph) for sample in self._dataset], } diff --git a/xpotato/dataset/sample.py b/xpotato/dataset/sample.py index d517e16..8f0ed0e 100644 --- a/xpotato/dataset/sample.py +++ b/xpotato/dataset/sample.py @@ -1,15 +1,27 @@ -from typing import Tuple +from typing import Dict, Tuple from xpotato.graph_extractor.graph import PotatoGraph class Sample: def __init__( - self, example: Tuple[str, str], potato_graph: PotatoGraph = None + self, + example: Tuple[str, str], + potato_graph: PotatoGraph = None, + label_id: int = None, ) -> None: self.text = example[0] self.label = example[1] + self.label_id = label_id self.potato_graph = potato_graph def set_graph(self, graph: PotatoGraph) -> None: self.potato_graph = graph + + def get_label_id(self, label_vocab: Dict[str, int]): + if self.label_id is not None: + return self.label_id + elif self.label and self.label in label_vocab: + return label_vocab[self.label] + else: + return None diff --git a/xpotato/dataset/utils.py b/xpotato/dataset/utils.py index 7cbd107..66fa97e 100644 --- a/xpotato/dataset/utils.py +++ b/xpotato/dataset/utils.py @@ -8,9 +8,10 @@ def save_dataframe(df: pd.DataFrame, path: str) -> None: - graphs = [graph_to_pn(graph) for graph in df["graph"].tolist()] - df["graph"] = graphs - df.to_csv(path, index=False, sep="\t") + df_to_save = df.copy() + graphs = [graph_to_pn(graph) for graph in df_to_save["graph"].tolist()] + df_to_save["graph"] = graphs + df_to_save.to_csv(path, index=False, sep="\t") def ud_to_graph(sen, edge_attr="color"): From c592e9f3e9003334028ccdbb03a1a448bc30696f Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Mon, 14 Feb 2022 14:02:04 +0100 Subject: [PATCH 05/15] Update READMEs and downloads #26 --- README.md | 15 +- features/botium/saved_features.json | 1 + features/crowdtruth/README.md | 4 +- features/crowdtruth/crowdtruth.ipynb | 26 +- features/crowdtruth/data.sh | 24 +- features/food/README.md | 4 +- features/food/data.sh | 16 +- features/food/{food.ipnyb => food.ipynb} | 10 +- features/hasoc/README.md | 6 +- features/hasoc/data.sh | 18 +- features/semeval/README.md | 2 +- features/semeval/data.sh | 4 +- notebooks/hasoc_examples.ipynb | 478 ++++++++++++-- notebooks/relation_examples.ipynb | 779 +++++++++-------------- 14 files changed, 812 insertions(+), 575 deletions(-) create mode 100644 features/botium/saved_features.json rename features/food/{food.ipnyb => food.ipynb} (96%) diff --git a/README.md b/README.md index 95286e3..22fe43c 100644 --- a/README.md +++ 
b/README.md
@@ -258,13 +258,14 @@ trainer = GraphTrainer(df)
 #extract features
 features = trainer.prepare_and_train()
 
+from xpotato.dataset.utils import save_dataframe
 from sklearn.model_selection import train_test_split
 
 train, val = train_test_split(df, test_size=0.2, random_state=1234)
 
 #save train and validation, this is important for the frontend to work
-train.to_pickle("train_dataset")
-val.to_pickle("val_dataset")
+save_dataframe(train, 'train.tsv')
+save_dataframe(val, 'val.tsv')
 
 import json
 
@@ -287,18 +288,18 @@ with open("graphs.pickle", "wb") as f:
 If the DataFrame is ready with the parsed graphs, the UI can be started to inspect the extracted rules and modify them. The frontend is a streamlit app, the simplest way of starting it is (the training and the validation dataset must be provided):
 
 ```
-streamlit run frontend/app.py -- -t notebooks/train_dataset -v notebooks/val_dataset -g ud
+streamlit run frontend/app.py -- -t notebooks/train.tsv -v notebooks/val.tsv -g ud
 ```
 
 it can also be started with the extracted features:
 
 ```
-streamlit run frontend/app.py -- -t notebooks/train_dataset -v notebooks/val_dataset -g ud -sr notebooks/features.json
+streamlit run frontend/app.py -- -t notebooks/train.tsv -v notebooks/val.tsv -g ud -sr notebooks/features.json
 ```
 
 if you have already used the UI and extracted features manually, you can load them by running:
 
 ```
-streamlit run frontend/app.py -- -t notebooks/train_dataset -v notebooks/val_dataset -g ud -sr notebooks/features.json -hr notebooks/manual_features.json
+streamlit run frontend/app.py -- -t notebooks/train.tsv -v notebooks/val.tsv -g ud -sr notebooks/features.json -hr notebooks/manual_features.json
 ```
 
 ### Advanced mode
@@ -331,7 +332,7 @@ sentences = [("Governments and industries in nations around the world are pourin
 Then, the frontend can be started:
 
 ```
-streamlit run frontend/app.py -- -t notebooks/unsupervised_dataset -g ud -m advanced
+streamlit run frontend/app.py -- -t notebooks/unsupervised_dataset.tsv -g ud -m advanced
 ```
 
 Once the frontend starts up and you define the labels, you are faced with the annotation interface. You can search elements by clicking on the appropriate column name and applying the desired filter. You can annotate instances by checking the checkbox at the beginning of the line. You can check multiple checkboxes at a time. Once you've selected the utterances you want to annotate, click on the _Annotate_ button. The annotated samples will appear in the lower table. You can clear the annotation of certain elements by selecting them in the second table and clicking _Clear annotation_.
 
@@ -345,7 +346,7 @@ Once you have some annotated data, you can train rules by clicking the _Train!_ 
 If you have the features ready and you want to evaluate them on a test set, you can run:
 
 ```python
-python scripts/evaluate.py -t ud -f notebooks/features.json -d notebooks/val_dataset
+python scripts/evaluate.py -t ud -f notebooks/features.json -d notebooks/val.tsv
 ```
 
 The result will be a _csv_ file with the labels and the matched rules. 
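For reference, the TSV round-trip these README changes describe can be sketched in a few lines (a minimal sketch, assuming the column layout written by `save_dataframe`; `load_tsv_dataset` is a hypothetical helper name, not part of the library):

```python
import pandas as pd

from xpotato.graph_extractor.graph import PotatoGraph


def load_tsv_dataset(path: str) -> pd.DataFrame:
    # save_dataframe serializes each graph into the "graph" column as a PENMAN string
    df = pd.read_csv(path, sep="\t")
    # decode the PENMAN strings back into networkx graphs via PotatoGraph
    df["graph"] = [PotatoGraph(graph_str=g).graph for g in df["graph"]]
    return df


train = load_tsv_dataset("notebooks/train.tsv")
```

This mirrors the `read_df` helper that this series adds to `frontend/utils.py` and `scripts/evaluate.py`.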
diff --git a/features/botium/saved_features.json b/features/botium/saved_features.json new file mode 100644 index 0000000..3fa66a6 --- /dev/null +++ b/features/botium/saved_features.json @@ -0,0 +1 @@ +{"OFF": [[["(u_2 / Erotik)"], [], "OFF"], [["(u_12 / blod.*)"], [], "OFF"], [["(u_1 / Schwein)"], [], "OFF"], [["(u_12 / Bloedsinn)"], [], "OFF"], [["(u_22 / sex)"], [], "OFF"], [["(u_1 / scheisse)"], [], "OFF"], [["(u_1 / dumm)"], [], "OFF"], [["(u_1 / sterben)"], [], "OFF"], [["(u1 / .*arsch.*)"], [], "OFF"], [["(u1 / leck)"], [], "OFF"], [["(u1 / toet.*)"], [], "OFF"], [["(u1 / du)", "(u2 / sein)"], [], "OFF"]]} \ No newline at end of file diff --git a/features/crowdtruth/README.md b/features/crowdtruth/README.md index 56f63ea..6275f4d 100644 --- a/features/crowdtruth/README.md +++ b/features/crowdtruth/README.md @@ -15,11 +15,11 @@ Prebuilt rule-systems for both the _cause_ and the _treat_ label are also availa Then the frontend of POTATO can be started from the __frontend__ directory: ```bash -streamlit run app.py -- -t ../features/crowdtruth/crowdtruth_train_dataset_cause_ud.pickle -v ../features/crowdtruth/crowdtruth_dev_dataset_cause_ud.pickle -hr ../features/crowdtruth/crowd_cause_features_ud.json +streamlit run app.py -- -t ../features/crowdtruth/crowdtruth_train_dataset_cause_ud.tsv -v ../features/crowdtruth/crowdtruth_dev_dataset_cause_ud.tsv -hr ../features/crowdtruth/crowd_cause_features_ud.json ``` If you are done building the rule-system, you can evaluate it on the test data, for this run _evaluate.py_ from the _scripts_ directory. ```bash -python evaluate.py -t ud -f ../features/crowdtruth/crowd_cause_features_ud.json -d ../features/crowdtruth/crowdtruth_train_dataset_cause_ud.pickle +python evaluate.py -t ud -f ../features/crowdtruth/crowd_cause_features_ud.json -d ../features/crowdtruth/crowdtruth_train_dataset_cause_ud.tsv ``` \ No newline at end of file diff --git a/features/crowdtruth/crowdtruth.ipynb b/features/crowdtruth/crowdtruth.ipynb index 375cbd4..e3faed1 100644 --- a/features/crowdtruth/crowdtruth.ipynb +++ b/features/crowdtruth/crowdtruth.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "77655d7b", + "metadata": {}, + "outputs": [], + "source": [ + "!wget -nc -q -O \"ground_truth_cause.csv\" \"https://raw.githubusercontent.com/CrowdTruth/Medical-Relation-Extraction/master/ground_truth_cause.csv\"\n", + "!wget -nc -q -O \"ground_truth_treat.csv\" \"https://raw.githubusercontent.com/CrowdTruth/Medical-Relation-Extraction/master/ground_truth_treat.csv\"\n", + "!wget -nc -q -O \"ground_truth_cause.xlsx\" \"https://github.com/CrowdTruth/Medical-Relation-Extraction/blob/master/train_dev_test/ground_truth_cause.xlsx?raw=true\"\n", + "!wget -nc -q -O \"ground_truth_treat.xlsx\" \"https://github.com/CrowdTruth/Medical-Relation-Extraction/blob/master/train_dev_test/ground_truth_treat.xlsx?raw=true\"\n", + "!wget -nc -q -O \"food_disease_dataset.csv\" \"https://raw.githubusercontent.com/gjorgjinac/food-disease-dataset/main/food_disease_dataset.csv\"" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -324,16 +338,16 @@ "metadata": {}, "outputs": [], "source": [ + "\n", + "from xpotato.dataset.utils import save_dataframe\n", + "\n", "train_df = train_dataset.to_dataframe()\n", "dev_df = dev_dataset.to_dataframe()\n", "test_df = test_dataset.to_dataframe()\n", "\n", - "#train_df.to_pickle(\"crowdtruth_train_dataset_treat_fourlang.pickle\")\n", - "#dev_df.to_pickle(\"crowdtruth_dev_dataset_treat_fourlang.pickle\")\n", - 
"#test_df.to_pickle(\"crowdtruth_test_dataset_treat_fourlang.pickle\")\n", - "train_df.to_pickle(\"crowdtruth_train_dataset_cause_fourlang.pickle\")\n", - "dev_df.to_pickle(\"crowdtruth_dev_dataset_cause_fourlang.pickle\")\n", - "test_df.to_pickle(\"crowdtruth_test_dataset_cause_fourlang.pickle\")" + "save_dataframe(train_df, \"crowdtruth_train_dataset_cause_fourlang.tsv\")\n", + "save_dataframe(dev_df, \"crowdtruth_dev_dataset_cause_fourlang.tsv\")\n", + "save_dataframe(test_df, \"crowdtruth_test_dataset_cause_fourlang.tsv\")" ] }, { diff --git a/features/crowdtruth/data.sh b/features/crowdtruth/data.sh index 1e80806..1a1b04b 100644 --- a/features/crowdtruth/data.sh +++ b/features/crowdtruth/data.sh @@ -1,12 +1,12 @@ -wget https://owncloud.tuwien.ac.at/index.php/s/z3IMX2fUNM7Kw6i/download -O crowdtruth_dev_dataset_cause_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/C4MOznjvxpcU5Ik/download -O crowdtruth_dev_dataset_cause_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/39s2AsFYTL3Keni/download -O crowdtruth_dev_dataset_treat_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/RqC1SzWhRXoKOnn/download -O crowdtruth_dev_dataset_treat_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/WpxGeblkiEhkIib/download -O crowdtruth_test_dataset_cause_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/wro8yTxXYK6WpF8/download -O crowdtruth_test_dataset_cause_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/xLz0fOxjb8ORBlR/download -O crowdtruth_test_dataset_treat_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/RaCcWl0xVdVpPQZ/download -O crowdtruth_test_dataset_treat_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/i7BuiCMvYWcZlI1/download -O crowdtruth_train_dataset_cause_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/NAHY0g1XqYM28LQ/download -O crowdtruth_train_dataset_cause_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/OPzP4kgD4PVwZOA/download -O crowdtruth_train_dataset_treat_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/sL3s3uaUgnLdKsy/download -O crowdtruth_train_dataset_treat_ud.pickle +wget https://owncloud.tuwien.ac.at/index.php/s/aHX8ByPg8nN3W5v/download -O crowdtruth_dev_dataset_cause_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/1P1OppoaeFPk4iI/download -O crowdtruth_dev_dataset_cause_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/imAYGbrNVtTHCRs/download -O crowdtruth_dev_dataset_treat_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/oOOZhWVjC40xxQm/download -O crowdtruth_dev_dataset_treat_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/C2SQeWPqDdQrtXQ/download -O crowdtruth_test_dataset_cause_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/3PGrMU6SINTSbfl/download -O crowdtruth_test_dataset_cause_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/hDyM5x4XCcqANt3/download -O crowdtruth_test_dataset_treat_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/SGv5zZm5UyulXT1/download -O crowdtruth_test_dataset_treat_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/KcpBVwigbB19H56/download -O crowdtruth_train_dataset_cause_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/tjLqzSUl0zU32zu/download -O crowdtruth_train_dataset_cause_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/0cDVR9nz0I4QWvp/download -O crowdtruth_train_dataset_treat_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/PTOhXBqxrLmrAzW/download -O crowdtruth_train_dataset_treat_ud.tsv diff 
--git a/features/food/README.md b/features/food/README.md index 90484de..984ab0a 100644 --- a/features/food/README.md +++ b/features/food/README.md @@ -15,11 +15,11 @@ Prebuilt rule-systems for both the _cause_ and the _treat_ label are also availa Then the frontend of POTATO can be started from the __frontend__ directory: ```bash -streamlit run app.py -- -t ../features/food/food_train_dataset_cause_ud.pickle -v ../features/food/food_dev_dataset_cause_ud.pickle -hr ../features/crowdtruth/food_cause_features_ud.json +streamlit run app.py -- -t ../features/food/food_train_dataset_cause_ud.tsv -v ../features/food/food_dev_dataset_cause_ud.tsv -hr ../features/crowdtruth/food_cause_features_ud.json ``` If you are done building the rule-system, you can evaluate it on the test data, for this run _evaluate.py_ from the _scripts_ directory. ```bash -python evaluate.py -t ud -f ../features/food/food_cause_features_ud.json -d ../features/crowdtruth/food_train_dataset_cause_ud.pickle +python evaluate.py -t ud -f ../features/food/food_cause_features_ud.json -d ../features/crowdtruth/food_train_dataset_cause_ud.tsv ``` \ No newline at end of file diff --git a/features/food/data.sh b/features/food/data.sh index aa21fdd..34a183b 100644 --- a/features/food/data.sh +++ b/features/food/data.sh @@ -1,8 +1,8 @@ -wget https://owncloud.tuwien.ac.at/index.php/s/G8pbpWQq6bqYbXp/download -O food_dev_dataset_cause_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/zNlkmijP6T0bRT5/download -O food_dev_dataset_cause_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/lJIRnQBkhyn8bQs/download -O food_dev_dataset_treat_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/Nj9vpcBs2C4aFMW/download -O food_dev_dataset_treat_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/WFoTXbRrtn1QDqT/download -O food_test_dataset_cause_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/dEvaQhhCQ39e2hv/download -O food_test_dataset_cause_ud.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/A9U3iz5SzGwmdW6/download -O food_test_dataset_treat_fourlang.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/d4Q09GVI89XwKuD/download -O food_test_dataset_treat_ud.pickle +wget https://owncloud.tuwien.ac.at/index.php/s/eQHmVCULV3sYVKF/download -O food_dev_dataset_cause_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/Jem0O20atHYJYkf/download -O food_dev_dataset_cause_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/62v47pY8KwBwlJj/download -O food_dev_dataset_treat_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/3KSW4JUJRcUp5zA/download -O food_dev_dataset_treat_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/EC8qjI6Jo1BTaJ4/download -O food_test_dataset_cause_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/LWoP5x2DD0QzM2p/download -O food_test_dataset_cause_ud.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/b8DILcmjJhH7IgP/download -O food_test_dataset_treat_fourlang.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/CDmcKXJlcRv8Wcv/download -O food_test_dataset_treat_ud.tsv diff --git a/features/food/food.ipnyb b/features/food/food.ipynb similarity index 96% rename from features/food/food.ipnyb rename to features/food/food.ipynb index 96d020a..587788f 100644 --- a/features/food/food.ipnyb +++ b/features/food/food.ipynb @@ -209,8 +209,10 @@ "metadata": {}, "outputs": [], "source": [ - "train_df.to_pickle(\"food_train_dataset_treat_ud.pickle\")\n", - "dev_df.to_pickle(\"food_dev_dataset_treat_ud.pickle\")" + "from 
xpotato.dataset.utils import save_dataframe\n",
+ "\n",
+ "save_dataframe(train_df, 'food_train_dataset_treat_ud.tsv')\n",
+ "save_dataframe(dev_df, 'food_dev_dataset_treat_ud.tsv')"
   ]
  },
  {
@@ -255,8 +257,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
- "train_df.to_pickle(\"food_train_dataset_cause_fourlang.pickle\")\n",
- "dev_df.to_pickle(\"food_dev_dataset_cause_fourang.pickle\")"
+ "save_dataframe(train_df, 'food_train_dataset_cause_fourlang.tsv')\n",
+ "save_dataframe(dev_df, 'food_dev_dataset_cause_fourlang.tsv')"
   ]
  },
  {
diff --git a/features/hasoc/README.md b/features/hasoc/README.md
index 8931b91..7f3323b 100644
--- a/features/hasoc/README.md
+++ b/features/hasoc/README.md
@@ -15,18 +15,18 @@ Prebuilt rule-systems are available in this directory for the _2019, 2020, 2021_
 Then the frontend of POTATO can be started from the __frontend__ directory:
 
 ```bash
-streamlit run app.py -- -t ../features/hasoc/hasoc_2021_train_amr.pickle -v ../features/hasoc/hasoc_2021_val_amr.pickle -hr ../features/hasoc/2021_train_features_task1.json
+streamlit run app.py -- -t ../features/hasoc/hasoc_2021_train_amr.tsv -v ../features/hasoc/hasoc_2021_val_amr.tsv -hr ../features/hasoc/2021_train_features_task1.json
 ```
 
 If you want to reproduce our output, run _evaluate.py_ from the _scripts_ directory.
 
 ```bash
-python evaluate.py -t amr -f ../features/hasoc/2021_train_features_task1.json -d ../features/hasoc/hasoc_2021_test_amr.pickle
+python evaluate.py -t amr -f ../features/hasoc/2021_train_features_task1.json -d ../features/hasoc/hasoc_2021_test_amr.tsv
 ```
 
 If you want to get the classification report, run the script with the __mode__ (-m) parameter:
 
 ```bash
-python evaluate.py -t amr -f ../features/hasoc/2021_train_features_task1.json -d ../features/hasoc/hasoc_2021_test_amr.pickle -m report
+python evaluate.py -t amr -f ../features/hasoc/2021_train_features_task1.json -d ../features/hasoc/hasoc_2021_test_amr.tsv -m report
 ```
 
 ## Usage and examples on the HASOC data
diff --git a/features/hasoc/data.sh b/features/hasoc/data.sh
index e828a31..c78dc9b 100644
--- a/features/hasoc/data.sh
+++ b/features/hasoc/data.sh
@@ -1,9 +1,9 @@
-wget https://owncloud.tuwien.ac.at/index.php/s/VChBRMu2CghoVEB/download -O hasoc_2019_val_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/80ndwqwAnIqkTKt/download -O hasoc_2019_test_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/PtD2aqtuJtzUoH2/download -O hasoc_2019_train_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/gzlHeqNkp95ehLH/download -O hasoc_2020_val_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/RtiiwCjpyJ1pqdu/download -O hasoc_2020_test_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/mngqfVDaTsW7odk/download -O hasoc_2020_train_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/paqXOSj7bbMd5ZI/download -O hasoc_2021_val_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/oocwRTd0XRhgFYd/download -O hasoc_2021_test_amr.pickle
-wget https://owncloud.tuwien.ac.at/index.php/s/Khv85ErE6s0cSAc/download -O hasoc_2021_train_amr.pickle
\ No newline at end of file
+wget https://owncloud.tuwien.ac.at/index.php/s/sUHFGNdvphCUZsQ/download -O hasoc_2019_val_amr.tsv
+wget https://owncloud.tuwien.ac.at/index.php/s/PsaHO8N02K9u8sp/download -O hasoc_2019_test_amr.tsv
+wget https://owncloud.tuwien.ac.at/index.php/s/QLsQaME33zdT5Xw/download -O hasoc_2019_train_amr.tsv
+wget https://owncloud.tuwien.ac.at/index.php/s/Um7BjFu5847yXmd/download -O hasoc_2020_val_amr.tsv
+wget 
https://owncloud.tuwien.ac.at/index.php/s/47HQ9sKo5PmTCTH/download -O hasoc_2020_test_amr.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/hQ56wvpRKxUzVi8/download -O hasoc_2020_train_amr.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/2w8VNtqm7PXTgTX/download -O hasoc_2021_val_amr.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/5Y1V67KMwNMmLC8/download -O hasoc_2021_test_amr.tsv +wget https://owncloud.tuwien.ac.at/index.php/s/rhTbyW1CbfQuWk0/download -O hasoc_2021_train_amr.tsv \ No newline at end of file diff --git a/features/semeval/README.md b/features/semeval/README.md index 0649a85..a55900f 100644 --- a/features/semeval/README.md +++ b/features/semeval/README.md @@ -13,5 +13,5 @@ bash data.sh Then the frontend of POTATO can be started from the __frontend__ directory: ```bash -streamlit run app.py -- -t ../features/semeval/semeval_train.pickle -v ../features/semeval/semeval_val.pickle +streamlit run app.py -- -t ../features/semeval/semeval_train.tsv -v ../features/semeval/semeval_val.tsv ``` \ No newline at end of file diff --git a/features/semeval/data.sh b/features/semeval/data.sh index 8c6a37f..cdb630a 100644 --- a/features/semeval/data.sh +++ b/features/semeval/data.sh @@ -1,4 +1,4 @@ -wget https://owncloud.tuwien.ac.at/index.php/s/6gHDG8XArRuyzDc/download -O semeval_train.pickle +wget https://owncloud.tuwien.ac.at/index.php/s/OgNbqmkUgmcmCTA/download -O semeval_train.tsv wget https://owncloud.tuwien.ac.at/index.php/s/2ESe3bVKiSjZ8jJ/download -O semeval_train.txt wget https://owncloud.tuwien.ac.at/index.php/s/Nx3p4BG9xx7FHVQ/download -O semeval_train_4lang_graphs.pickle -wget https://owncloud.tuwien.ac.at/index.php/s/iX8Fmfsyf6vml6t/download -O semeval_val.pickle +wget https://owncloud.tuwien.ac.at/index.php/s/OgNbqmkUgmcmCTA/download -O semeval_val.tsv diff --git a/notebooks/hasoc_examples.ipynb b/notebooks/hasoc_examples.ipynb index 7b6d348..079323a 100644 --- a/notebooks/hasoc_examples.ipynb +++ b/notebooks/hasoc_examples.ipynb @@ -394,28 +394,31 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-01 17:19:46 INFO: Loading these models for language: en (English):\n", - "=========================\n", - "| Processor | Package |\n", - "-------------------------\n", - "| tokenize | ewt |\n", - "| pos | ewt |\n", - "| lemma | ewt |\n", - "| depparse | ewt |\n", - "| sentiment | sstplus |\n", - "| ner | ontonotes |\n", - "=========================\n", + "2022-02-14 13:47:31,501 : core (112) - INFO - Loading these models for language: en (English):\n", + "============================\n", + "| Processor | Package |\n", + "----------------------------\n", + "| tokenize | combined |\n", + "| pos | combined |\n", + "| lemma | combined |\n", + "| depparse | combined |\n", + "| sentiment | sstplus |\n", + "| constituency | wsj |\n", + "| ner | ontonotes |\n", + "============================\n", "\n", - "2021-12-01 17:19:46 INFO: Use device: cpu\n", - "2021-12-01 17:19:46 INFO: Loading: tokenize\n", - "2021-12-01 17:19:46 INFO: Loading: pos\n", - "2021-12-01 17:19:48 INFO: Loading: lemma\n", - "2021-12-01 17:19:48 INFO: Loading: depparse\n", - "2021-12-01 17:19:49 INFO: Loading: sentiment\n", - "2021-12-01 17:19:50 INFO: Loading: ner\n", - "2021-12-01 17:19:51 INFO: Done loading processors!\n", - "WARNING:root:creating new NLP cache in en_nlp_cache\n", - "100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [00:07<00:00, 2.00it/s]\n" + "2022-02-14 13:47:31,505 : core (123) - INFO - Use device: cpu\n", + "2022-02-14 
13:47:31,506 : core (129) - INFO - Loading: tokenize\n", + "2022-02-14 13:47:31,513 : core (129) - INFO - Loading: pos\n", + "2022-02-14 13:47:31,689 : core (129) - INFO - Loading: lemma\n", + "2022-02-14 13:47:31,718 : core (129) - INFO - Loading: depparse\n", + "2022-02-14 13:47:32,032 : core (129) - INFO - Loading: sentiment\n", + "2022-02-14 13:47:32,275 : core (129) - INFO - Loading: constituency\n", + "2022-02-14 13:47:32,591 : core (129) - INFO - Loading: ner\n", + "2022-02-14 13:47:33,039 : core (179) - INFO - Done loading processors!\n", + "2022-02-14 13:47:33,041 : pipeline (40) - INFO - loading NLP cache from en_nlp_cache...\n", + "2022-02-14 13:47:33,051 : pipeline (42) - INFO - done!\n", + "100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 2886.78it/s]\n" ] } ], @@ -495,7 +498,7 @@ " RT [USER]: America is the most fucked up count...\n", " HOF\n", " 1\n", - " (1, 0, 2, 3, 4, 5, 7, 6, 8, 9, 10)\n", + " (1, 0, 2, 3, 4, 5, 6, 12, 7, 8, 9, 10, 11, 13,...\n", " \n", " \n", " 3\n", @@ -523,7 +526,7 @@ " Bitch YES [URL]\n", " HOF\n", " 1\n", - " (1, 0, 2, 4, 3, 5)\n", + " (1, 0, 2, 3, 4, 5)\n", " \n", " \n", " 7\n", @@ -537,28 +540,28 @@ " RT [USER]: im not fine, i need you\n", " NOT\n", " 0\n", - " (1, 0, 2, 3, 4, 5, 6, 7, 8)\n", + " (1, 0, 2, 3, 4, 5, 8, 6, 7, 9, 11, 10, 12)\n", " \n", " \n", " 9\n", " Holy shit.. 3 months and I'll be in Italy\n", " HOF\n", " 1\n", - " (1, 2, 0, 3, 4, 5, 6, 11, 7, 8, 9, 10)\n", + " (1, 2, 0, 3, 8, 4, 5, 6, 7)\n", " \n", " \n", " 10\n", " Now I do what I want 🤪\n", " NOT\n", " 0\n", - " (1, 3, 2, 0, 4, 5, 6, 7)\n", + " (1, 4, 2, 3, 0, 5, 6, 7)\n", " \n", " \n", " 11\n", " [USER] you'd immediately stop\n", " NOT\n", " 0\n", - " (1, 2, 0, 3, 4, 7, 5, 6)\n", + " (1, 2, 7, 3, 4, 5, 6, 0)\n", " \n", " \n", " 12\n", @@ -572,7 +575,7 @@ " RT [USER]: ohhhh shit a [USER] [URL]\n", " HOF\n", " 1\n", - " (1, 0, 2, 3, 4, 5, 6, 7, 8, 10, 9, 11)\n", + " (1, 0, 2, 3, 4, 5, 6, 10, 7, 8, 9, 11, 12, 13,...\n", " \n", " \n", " 14\n", @@ -614,18 +617,18 @@ " graph \n", "0 (1, 0, 2, 3, 4, 5, 6) \n", "1 (1, 3, 2, 0, 4, 5, 8, 6, 7, 9, 10, 11, 13, 12,... \n", - "2 (1, 0, 2, 3, 4, 5, 7, 6, 8, 9, 10) \n", + "2 (1, 0, 2, 3, 4, 5, 6, 12, 7, 8, 9, 10, 11, 13,... \n", "3 (1, 4, 2, 3, 0, 5, 7, 6, 8, 10, 9, 11, 12, 13,... \n", "4 (1, 3, 2, 0, 4, 7, 5, 6, 8, 9, 10) \n", "5 (1, 0, 2, 3) \n", - "6 (1, 0, 2, 4, 3, 5) \n", + "6 (1, 0, 2, 3, 4, 5) \n", "7 (1, 2, 0, 3, 4, 5) \n", - "8 (1, 0, 2, 3, 4, 5, 6, 7, 8) \n", - "9 (1, 2, 0, 3, 4, 5, 6, 11, 7, 8, 9, 10) \n", - "10 (1, 3, 2, 0, 4, 5, 6, 7) \n", - "11 (1, 2, 0, 3, 4, 7, 5, 6) \n", + "8 (1, 0, 2, 3, 4, 5, 8, 6, 7, 9, 11, 10, 12) \n", + "9 (1, 2, 0, 3, 8, 4, 5, 6, 7) \n", + "10 (1, 4, 2, 3, 0, 5, 6, 7) \n", + "11 (1, 2, 7, 3, 4, 5, 6, 0) \n", "12 (1, 3, 2, 0, 4, 5, 6) \n", - "13 (1, 0, 2, 3, 4, 5, 6, 7, 8, 10, 9, 11) \n", + "13 (1, 0, 2, 3, 4, 5, 6, 10, 7, 8, 9, 11, 12, 13,... 
\n", "14 (1, 4, 2, 3, 0, 5, 8, 6, 7, 9) \n", "15 (1, 0, 2) " ] @@ -682,7 +685,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "16it [00:00, 1383.72it/s]\n" + "16it [00:00, 8034.10it/s]\n" ] }, { @@ -874,7 +877,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "16it [00:00, 2464.97it/s]\n" + "16it [00:00, 6796.52it/s]\n" ] }, { @@ -1066,7 +1069,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "16it [00:00, 2086.65it/s]\n" + "16it [00:00, 3718.15it/s]\n" ] }, { @@ -1258,7 +1261,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "16it [00:00, 4242.56it/s]\n" + "16it [00:00, 7336.71it/s]\n" ] }, { @@ -1464,6 +1467,203 @@ { "cell_type": "code", "execution_count": 18, + "id": "f066ebee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[dataframe HTML table output; markup lost in extraction, same rows as the text/plain block below]\n",
+      "
" + ], + "text/plain": [ + " text label label_id \\\n", + "0 fuck absolutely everything about today. HOF 1 \n", + "1 I just made food and I'm making myself sick to... HOF 1 \n", + "2 RT [USER]: America is the most fucked up count... HOF 1 \n", + "3 you'd be blind to not see the heart eyes i hav... NOT 0 \n", + "4 It's hard for me to give a fuck now HOF 1 \n", + "5 tell me everything NOT 0 \n", + "6 Bitch YES [URL] HOF 1 \n", + "7 Eight people a minute.... NOT 0 \n", + "8 RT [USER]: im not fine, i need you NOT 0 \n", + "9 Holy shit.. 3 months and I'll be in Italy HOF 1 \n", + "10 Now I do what I want 🤪 NOT 0 \n", + "11 [USER] you'd immediately stop NOT 0 \n", + "12 Just... shut the fuck up HOF 1 \n", + "13 RT [USER]: ohhhh shit a [USER] [URL] HOF 1 \n", + "14 all i want is for yara to survive tonight NOT 0 \n", + "15 fuck them HOF 1 \n", + "\n", + " graph \n", + "0 (1, 0, 2, 3, 4, 5, 6) \n", + "1 (1, 3, 2, 0, 4, 5, 8, 6, 7, 9, 10, 11, 13, 12,... \n", + "2 (1, 0, 2, 3, 4, 5, 6, 12, 7, 8, 9, 10, 11, 13,... \n", + "3 (1, 4, 2, 3, 0, 5, 7, 6, 8, 10, 9, 11, 12, 13,... \n", + "4 (1, 3, 2, 0, 4, 7, 5, 6, 8, 9, 10) \n", + "5 (1, 0, 2, 3) \n", + "6 (1, 0, 2, 3, 4, 5) \n", + "7 (1, 2, 0, 3, 4, 5) \n", + "8 (1, 0, 2, 3, 4, 5, 8, 6, 7, 9, 11, 10, 12) \n", + "9 (1, 2, 0, 3, 8, 4, 5, 6, 7) \n", + "10 (1, 4, 2, 3, 0, 5, 6, 7) \n", + "11 (1, 2, 7, 3, 4, 5, 6, 0) \n", + "12 (1, 3, 2, 0, 4, 5, 6) \n", + "13 (1, 0, 2, 3, 4, 5, 6, 10, 7, 8, 9, 11, 12, 13,... \n", + "14 (1, 4, 2, 3, 0, 5, 8, 6, 7, 9) \n", + "15 (1, 0, 2) " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "id": "50f43ae8", "metadata": {}, "outputs": [], @@ -1475,18 +1675,181 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "a672214c", "metadata": {}, "outputs": [], "source": [ - "train.to_pickle(\"train_dataset\")\n", - "val.to_pickle(\"val_dataset\")" + "from xpotato.dataset.utils import save_dataframe\n", + "\n", + "save_dataframe(train, \"train_dataset.tsv\")\n", + "save_dataframe(val, \"val_dataset.tsv\")" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, + "id": "a33b3122", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[dataframe HTML table output; markup lost in extraction, same rows as the text/plain block below]\n",
+      "
" + ], + "text/plain": [ + " text label label_id \\\n", + "14 all i want is for yara to survive tonight NOT 0 \n", + "2 RT [USER]: America is the most fucked up count... HOF 1 \n", + "10 Now I do what I want 🤪 NOT 0 \n", + "7 Eight people a minute.... NOT 0 \n", + "1 I just made food and I'm making myself sick to... HOF 1 \n", + "9 Holy shit.. 3 months and I'll be in Italy HOF 1 \n", + "8 RT [USER]: im not fine, i need you NOT 0 \n", + "4 It's hard for me to give a fuck now HOF 1 \n", + "5 tell me everything NOT 0 \n", + "6 Bitch YES [URL] HOF 1 \n", + "3 you'd be blind to not see the heart eyes i hav... NOT 0 \n", + "15 fuck them HOF 1 \n", + "\n", + " graph \n", + "14 (1, 4, 2, 3, 0, 5, 8, 6, 7, 9) \n", + "2 (1, 0, 2, 3, 4, 5, 6, 12, 7, 8, 9, 10, 11, 13,... \n", + "10 (1, 4, 2, 3, 0, 5, 6, 7) \n", + "7 (1, 2, 0, 3, 4, 5) \n", + "1 (1, 3, 2, 0, 4, 5, 8, 6, 7, 9, 10, 11, 13, 12,... \n", + "9 (1, 2, 0, 3, 8, 4, 5, 6, 7) \n", + "8 (1, 0, 2, 3, 4, 5, 8, 6, 7, 9, 11, 10, 12) \n", + "4 (1, 3, 2, 0, 4, 7, 5, 6, 8, 9, 10) \n", + "5 (1, 0, 2, 3) \n", + "6 (1, 0, 2, 3, 4, 5) \n", + "3 (1, 4, 2, 3, 0, 5, 7, 6, 8, 10, 9, 11, 12, 13,... \n", + "15 (1, 0, 2) " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train" + ] + }, + { + "cell_type": "code", + "execution_count": 22, "id": "d42d4556", "metadata": { "slideshow": { @@ -1508,7 +1871,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "id": "3a5d67d6", "metadata": { "slideshow": { @@ -1527,7 +1890,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "12it [00:00, 225.73it/s]\n" + "12it [00:00, 213.54it/s]" ] }, { @@ -1540,6 +1903,13 @@ "Training...\n", "Getting features...\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ @@ -1548,7 +1918,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "id": "9f628397", "metadata": {}, "outputs": [ @@ -1556,18 +1926,14 @@ "data": { "text/plain": [ "defaultdict(list,\n", - " {'HOF': [(['(u_13 / fuck)'], [], 'HOF'),\n", + " {'HOF': [(['(u_19 / fuck)'], [], 'HOF'),\n", " (['(u_1 / be)'], [], 'HOF'),\n", - " (['(u_17 / url :punct (u_16 / LSB) :punct (u_18 / RSB))'],\n", - " [],\n", - " 'HOF'),\n", - " (['(u_17 / url :punct (u_16 / LSB))'], [], 'HOF'),\n", - " (['(u_16 / LSB)'], [], 'HOF'),\n", - " (['(u_8 / to)'], [], 'HOF'),\n", - " (['(u_21 / I)'], [], 'HOF')]})" + " (['(u_13 / RSB)'], [], 'HOF'),\n", + " (['(u_11 / LSB)'], [], 'HOF'),\n", + " (['(u_8 / to)'], [], 'HOF')]})" ] }, - "execution_count": 22, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1578,7 +1944,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "id": "25eb919f", "metadata": {}, "outputs": [], diff --git a/notebooks/relation_examples.ipynb b/notebooks/relation_examples.ipynb index 3fdb662..933367f 100644 --- a/notebooks/relation_examples.ipynb +++ b/notebooks/relation_examples.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -87,52 +87,35 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 4, "metadata": {}, "outputs": [ { 
"name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 12:22:11 WARNING: Can not find mwt: default from official model list. Ignoring it.\n", - "WARNING:stanza:Can not find mwt: default from official model list. Ignoring it.\n", - "2021-12-03 12:22:11 INFO: Loading these models for language: en (English):\n", - "=======================\n", - "| Processor | Package |\n", - "-----------------------\n", - "| tokenize | ewt |\n", - "| pos | ewt |\n", - "| lemma | ewt |\n", - "| depparse | ewt |\n", - "=======================\n", + "2022-02-14 12:39:47,351 : common (213) - WARNING - Can not find mwt: default from official model list. Ignoring it.\n", + "2022-02-14 12:39:47,353 : core (112) - INFO - Loading these models for language: en (English):\n", + "========================\n", + "| Processor | Package |\n", + "------------------------\n", + "| tokenize | combined |\n", + "| pos | combined |\n", + "| lemma | combined |\n", + "| depparse | combined |\n", + "========================\n", "\n", - "INFO:stanza:Loading these models for language: en (English):\n", - "=======================\n", - "| Processor | Package |\n", - "-----------------------\n", - "| tokenize | ewt |\n", - "| pos | ewt |\n", - "| lemma | ewt |\n", - "| depparse | ewt |\n", - "=======================\n", - "\n", - "2021-12-03 12:22:11 INFO: Use device: cpu\n", - "INFO:stanza:Use device: cpu\n", - "2021-12-03 12:22:11 INFO: Loading: tokenize\n", - "INFO:stanza:Loading: tokenize\n", - "2021-12-03 12:22:11 INFO: Loading: pos\n", - "INFO:stanza:Loading: pos\n", - "2021-12-03 12:22:12 INFO: Loading: lemma\n", - "INFO:stanza:Loading: lemma\n", - "2021-12-03 12:22:12 INFO: Loading: depparse\n", - "INFO:stanza:Loading: depparse\n", - "2021-12-03 12:22:12 INFO: Done loading processors!\n", - "INFO:stanza:Done loading processors!\n", - "WARNING:root:loading NLP cache from en_nlp_cache...\n", - "WARNING:root:done!\n", - "WARNING:root:loading cache from file: cache/UD_FL.json\n", - "WARNING:root:loaded cache from cache/UD_FL.json with interpretations: ['fl', 'ud']\n", - "100%|██████████| 18/18 [00:00<00:00, 1322.89it/s]\n" + "2022-02-14 12:39:47,376 : core (123) - INFO - Use device: cpu\n", + "2022-02-14 12:39:47,377 : core (129) - INFO - Loading: tokenize\n", + "2022-02-14 12:39:47,382 : core (129) - INFO - Loading: pos\n", + "2022-02-14 12:39:47,537 : core (129) - INFO - Loading: lemma\n", + "2022-02-14 12:39:47,562 : core (129) - INFO - Loading: depparse\n", + "2022-02-14 12:39:47,852 : core (179) - INFO - Done loading processors!\n", + "2022-02-14 12:39:47,853 : pipeline (40) - INFO - loading NLP cache from en_nlp_cache...\n", + "2022-02-14 12:39:47,862 : pipeline (42) - INFO - done!\n", + "2022-02-14 12:39:47,863 : irtg (81) - INFO - loading cache from file: cache/UD_FL.json\n", + "2022-02-14 12:39:47,864 : irtg (21) - INFO - loaded cache from cache/UD_FL.json with interpretations: ['fl', 'ud']\n", + "100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 1283.86it/s]\n" ] } ], @@ -143,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -153,251 +136,167 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
[HTML rendering of this dataframe omitted: the table markup was lost in extraction; the same 18 rows follow in the text/plain output below]\n",
-       "
" + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "finite_state_machine\n", + "\n", + "\n", + "\n", + "COORD\n", + "\n", + "COORD\n", + "\n", + "\n", + "\n", + "government\n", + "\n", + "government\n", + "\n", + "\n", + "\n", + "COORD->government\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "\n", + "industry\n", + "\n", + "industry\n", + "\n", + "\n", + "\n", + "COORD->industry\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "\n", + "YYY\n", + "\n", + "YYY\n", + "\n", + "\n", + "\n", + "around\n", + "\n", + "around\n", + "\n", + "\n", + "\n", + "nation\n", + "\n", + "nation\n", + "\n", + "\n", + "\n", + "around->nation\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "\n", + "world\n", + "\n", + "world\n", + "\n", + "\n", + "\n", + "around->world\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "\n", + "in\n", + "\n", + "in\n", + "\n", + "\n", + "\n", + "in->government\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "\n", + "in->nation\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "\n", + "into\n", + "\n", + "into\n", + "\n", + "\n", + "\n", + "into->YYY\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "\n", + "pour\n", + "\n", + "pour\n", + "\n", + "\n", + "\n", + "into->pour\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "\n", + "pour->COORD\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "\n", + "xxx\n", + "\n", + "xxx\n", + "\n", + "\n", + "\n", + "pour->xxx\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "\n" ], "text/plain": [ - " text \\\n", - "0 Governments and industries in nations around t... \n", - "1 The scientists poured XXX into pint YYY. \n", - "2 The suspect pushed the XXX into a deep YYY. \n", - "3 The Nepalese government sets up a XXX to inqui... \n", - "4 The entity1 to buy papers is pushed into the n... \n", - "5 An unnamed XXX was pushed into the YYY. \n", - "6 Since then, numerous independent feature XXX h... \n", - "7 For some reason, the XXX was blinded from his ... \n", - "8 Sparky Anderson is making progress in his XXX ... \n", - "9 Olympics have already poured one XXX into the ... \n", - "10 After wrapping him in a light blanket, they pl... \n", - "11 I placed the XXX in a natural YYY, at the base... \n", - "12 The XXX was delivered from the YYY of Lincoln ... \n", - "13 The XXX leaked from every conceivable YYY. \n", - "14 The scientists placed the XXX in a tiny YYY wh... \n", - "15 The level surface closest to the MSS, known as... \n", - "16 Gaza XXX recover from three YYY of war. \n", - "17 This latest XXX from the animation YYY at Pixa... \n", - "\n", - " label label_id \\\n", - "0 Entity-Destination(e1,e2) 1 \n", - "1 Entity-Destination(e1,e2) 1 \n", - "2 Entity-Destination(e1,e2) 1 \n", - "3 Other 0 \n", - "4 Entity-Destination(e1,e2) 1 \n", - "5 Entity-Destination(e1,e2) 1 \n", - "6 Other 0 \n", - "7 Other 0 \n", - "8 Other 0 \n", - "9 Entity-Destination(e1,e2) 1 \n", - "10 Entity-Destination(e1,e2) 1 \n", - "11 Entity-Destination(e1,e2) 1 \n", - "12 Other 0 \n", - "13 Other 0 \n", - "14 Entity-Destination(e1,e2) 1 \n", - "15 Other 0 \n", - "16 Other 0 \n", - "17 Other 0 \n", - "\n", - " graph \n", - "0 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) \n", - "1 (0, 1, 11, 9, 12, 13) \n", - "2 (14, 1, 15, 9, 12, 16) \n", - "3 (17, 1, 4, 18, 19, 20, 9, 12, 21, 22, 23, 24) \n", - "4 (14, 25, 26, 27, 9, 28, 29, 30) \n", - "5 (14, 1, 31, 9, 12) \n", - "6 (32, 1, 33, 34, 35, 9, 12, 36, 37) \n", - "7 (38, 1, 39, 40, 22, 41, 42, 12, 43, 44, 45, 46... \n", - "8 (2, 48, 19, 49, 50, 51, 52, 53, 54, 55, 5, 1, ... 
\n", - "9 (0, 1, 58, 59, 60, 9, 12) \n", - "10 (61, 1, 62, 5, 12, 63, 46, 45, 64, 65, 66, 67) \n", - "11 (61, 1, 68, 69, 70, 22, 71, 72, 73, 5, 12, 74) \n", - "12 (75, 76, 77, 71, 22, 78, 79, 80, 81, 44, 45, 8... \n", - "13 (1, 85, 42, 12, 86) \n", - "14 (61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91, ... \n", - "15 (95, 96, 5, 97, 98, 99, 19, 100, 101, 77, 1, 1... \n", - "16 (103, 104, 42, 12, 105, 22, 106) \n", - "17 (2, 107, 108, 75, 109, 110, 111, 112, 1, 113, ... " + "" ] }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": "\n\n\n\n\n\nfinite_state_machine\n\n\n\nCOORD\n\nCOORD\n\n\n\ngovernment\n\ngovernment\n\n\n\nCOORD->government\n\n\n0\n\n\n\nindustry\n\nindustry\n\n\n\nCOORD->industry\n\n\n0\n\n\n\nYYY\n\nYYY\n\n\n\naround\n\naround\n\n\n\nnation\n\nnation\n\n\n\naround->nation\n\n\n1\n\n\n\nworld\n\nworld\n\n\n\naround->world\n\n\n2\n\n\n\nin\n\nin\n\n\n\nin->government\n\n\n1\n\n\n\nin->nation\n\n\n2\n\n\n\ninto\n\ninto\n\n\n\ninto->YYY\n\n\n2\n\n\n\npour\n\npour\n\n\n\ninto->pour\n\n\n1\n\n\n\npour->COORD\n\n\n1\n\n\n\nxxx\n\nxxx\n\n\n\npour->xxx\n\n\n2\n\n\n\n", - "text/plain": [ - "" - ] - }, - "execution_count": 103, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -407,7 +306,7 @@ "from xpotato.models.utils import to_dot\n", "from graphviz import Source\n", "\n", - "Source(to_dot(dataset.graphs[0]))" + "Source(to_dot(df.iloc[0].graph))" ] }, { @@ -429,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -446,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -463,14 +362,14 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "18it [00:00, 5378.85it/s]\n" + "18it [00:00, 6470.47it/s]\n" ] }, { @@ -654,7 +553,7 @@ "17 " ] }, - "execution_count": 134, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -689,14 +588,14 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "18it [00:00, 4256.50it/s]\n" + "18it [00:00, 4578.38it/s]\n" ] }, { @@ -900,7 +799,7 @@ "17 " ] }, - "execution_count": 139, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -919,14 +818,14 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "18it [00:00, 2162.69it/s]\n" + "18it [00:00, 4810.90it/s]\n" ] }, { @@ -1130,7 +1029,7 @@ "17 " ] }, - "execution_count": 142, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1149,14 +1048,14 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "18it [00:00, 3771.48it/s]\n" + "18it [00:00, 6602.31it/s]\n" ] }, { @@ -1360,7 +1259,7 @@ "17 " ] }, - "execution_count": 145, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1380,16 +1279,16 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 14, "metadata": {}, "outputs": 
[ { "data": { "text/plain": [ - "['(u_1 / into :1 (u_2 / push|pour) :2 (u_3 / YYY))']" + "['(u_1 / into :1 (u_2 / pour|push) :2 (u_3 / YYY))']" ] }, - "execution_count": 107, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1415,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -1426,17 +1325,7 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "train.to_pickle(\"train_dataset\")\n", - "val.to_pickle(\"val_dataset\")" - ] - }, - { - "cell_type": "code", - "execution_count": 42, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1453,7 +1342,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1467,7 +1356,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "14it [00:00, 350.24it/s]" + "14it [00:00, 279.91it/s]" ] }, { @@ -1495,31 +1384,25 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "defaultdict(list,\n", - " {'Entity-Destination(e1,e2)': [(['(u_15 / into :1 (u_26 / push))'],\n", - " [],\n", - " 'Entity-Destination(e1,e2)'),\n", - " (['(u_15 / into :1 (u_19 / pour :2 (u_0 / xxx)))'],\n", + " {'Entity-Destination(e1,e2)': [(['(u_15 / into :1 (u_19 / pour :2 (u_1 / xxx)))'],\n", " [],\n", " 'Entity-Destination(e1,e2)'),\n", " (['(u_15 / into :1 (u_19 / pour))'],\n", " [],\n", " 'Entity-Destination(e1,e2)'),\n", - " (['(u_19 / pour :2 (u_0 / xxx))'],\n", - " [],\n", - " 'Entity-Destination(e1,e2)'),\n", - " (['(u_15 / into :2 (u_3 / yyy))'],\n", + " (['(u_19 / pour :2 (u_1 / xxx))'],\n", " [],\n", " 'Entity-Destination(e1,e2)')]})" ] }, - "execution_count": 44, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1530,7 +1413,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1539,7 +1422,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1570,8 +1453,10 @@ " Support\n", " False_positive_graphs\n", " False_positive_sens\n", + " False_positive_indices\n", " True_positive_graphs\n", " True_positive_sens\n", + " True_positive_indices\n", " False_negative_graphs\n", " False_negative_sens\n", " False_negative_indices\n", @@ -1581,147 +1466,100 @@ " \n", " \n", " 0\n", - " [(u_15 / into :1 (u_26 / push))]\n", - " 1.000000\n", + " [(u_15 / into :1 (u_19 / pour :2 (u_1 / xxx)))]\n", + " 1.0\n", " 0.428571\n", - " 0.600000\n", + " 0.6\n", " 7\n", " []\n", " []\n", - " [(14, 1, 15, 9, 12, 16), (14, 25, 26, 27, 9, 2...\n", - " [(The suspect pushed the XXX into a deep YYY.,...\n", - " [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,...\n", - " [(The scientists placed the XXX in a tiny YYY ...\n", - " [1]\n", - " [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]\n", - " \n", - " \n", - " 1\n", - " [(u_15 / into :1 (u_19 / pour :2 (u_0 / xxx)))]\n", - " 1.000000\n", - " 0.428571\n", - " 0.600000\n", - " 7\n", - " []\n", " []\n", " [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11...\n", " [(Governments and industries in nations around...\n", - " [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,...\n", + " [2, 6, 7]\n", + " [(2, 86, 35, 87, 88, 89, 61, 1, 11, 5, 12, 90,...\n", " [(The scientists placed the XXX in a tiny YYY ...\n", - " [1]\n", + " [1, 3, 9, 11]\n", " [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]\n", " \n", " 
\n", - " 2\n", + " 1\n", " [(u_15 / into :1 (u_19 / pour))]\n", - " 1.000000\n", + " 1.0\n", " 0.428571\n", - " 0.600000\n", + " 0.6\n", " 7\n", " []\n", " []\n", + " []\n", " [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11...\n", " [(Governments and industries in nations around...\n", - " [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,...\n", + " [2, 6, 7]\n", + " [(2, 86, 35, 87, 88, 89, 61, 1, 11, 5, 12, 90,...\n", " [(The scientists placed the XXX in a tiny YYY ...\n", - " [1]\n", + " [1, 3, 9, 11]\n", " [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]\n", " \n", " \n", - " 3\n", - " [(u_19 / pour :2 (u_0 / xxx))]\n", - " 1.000000\n", + " 2\n", + " [(u_19 / pour :2 (u_1 / xxx))]\n", + " 1.0\n", " 0.428571\n", - " 0.600000\n", + " 0.6\n", " 7\n", " []\n", " []\n", + " []\n", " [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11...\n", " [(Governments and industries in nations around...\n", - " [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,...\n", + " [2, 6, 7]\n", + " [(2, 86, 35, 87, 88, 89, 61, 1, 11, 5, 12, 90,...\n", " [(The scientists placed the XXX in a tiny YYY ...\n", - " [1]\n", + " [1, 3, 9, 11]\n", " [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]\n", " \n", - " \n", - " 4\n", - " [(u_15 / into :2 (u_3 / yyy))]\n", - " 0.833333\n", - " 0.714286\n", - " 0.769231\n", - " 7\n", - " [(32, 1, 33, 34, 35, 9, 12, 36, 37)]\n", - " [(Since then, numerous independent feature XXX...\n", - " [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (14, 1, 1...\n", - " [(Governments and industries in nations around...\n", - " [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,...\n", - " [(The scientists placed the XXX in a tiny YYY ...\n", - " [1]\n", - " [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0]\n", - " \n", " \n", "\n", "" ], "text/plain": [ " Feature Precision Recall \\\n", - "0 [(u_15 / into :1 (u_26 / push))] 1.000000 0.428571 \n", - "1 [(u_15 / into :1 (u_19 / pour :2 (u_0 / xxx)))] 1.000000 0.428571 \n", - "2 [(u_15 / into :1 (u_19 / pour))] 1.000000 0.428571 \n", - "3 [(u_19 / pour :2 (u_0 / xxx))] 1.000000 0.428571 \n", - "4 [(u_15 / into :2 (u_3 / yyy))] 0.833333 0.714286 \n", - "\n", - " Fscore Support False_positive_graphs \\\n", - "0 0.600000 7 [] \n", - "1 0.600000 7 [] \n", - "2 0.600000 7 [] \n", - "3 0.600000 7 [] \n", - "4 0.769231 7 [(32, 1, 33, 34, 35, 9, 12, 36, 37)] \n", + "0 [(u_15 / into :1 (u_19 / pour :2 (u_1 / xxx)))] 1.0 0.428571 \n", + "1 [(u_15 / into :1 (u_19 / pour))] 1.0 0.428571 \n", + "2 [(u_19 / pour :2 (u_1 / xxx))] 1.0 0.428571 \n", "\n", - " False_positive_sens \\\n", - "0 [] \n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [(Since then, numerous independent feature XXX... \n", + " Fscore Support False_positive_graphs False_positive_sens \\\n", + "0 0.6 7 [] [] \n", + "1 0.6 7 [] [] \n", + "2 0.6 7 [] [] \n", "\n", - " True_positive_graphs \\\n", - "0 [(14, 1, 15, 9, 12, 16), (14, 25, 26, 27, 9, 2... \n", - "1 [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11... \n", - "2 [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11... \n", - "3 [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11... \n", - "4 [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (14, 1, 1... \n", + " False_positive_indices True_positive_graphs \\\n", + "0 [] [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11... \n", + "1 [] [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11... \n", + "2 [] [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), (0, 1, 11... \n", "\n", - " True_positive_sens \\\n", - "0 [(The suspect pushed the XXX into a deep YYY.,... \n", - "1 [(Governments and industries in nations around... \n", - "2 [(Governments and industries in nations around... 
\n", - "3 [(Governments and industries in nations around... \n", - "4 [(Governments and industries in nations around... \n", + " True_positive_sens True_positive_indices \\\n", + "0 [(Governments and industries in nations around... [2, 6, 7] \n", + "1 [(Governments and industries in nations around... [2, 6, 7] \n", + "2 [(Governments and industries in nations around... [2, 6, 7] \n", "\n", " False_negative_graphs \\\n", - "0 [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,... \n", - "1 [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,... \n", - "2 [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,... \n", - "3 [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,... \n", - "4 [(61, 1, 11, 5, 12, 2, 87, 37, 88, 89, 90, 91,... \n", + "0 [(2, 86, 35, 87, 88, 89, 61, 1, 11, 5, 12, 90,... \n", + "1 [(2, 86, 35, 87, 88, 89, 61, 1, 11, 5, 12, 90,... \n", + "2 [(2, 86, 35, 87, 88, 89, 61, 1, 11, 5, 12, 90,... \n", "\n", " False_negative_sens False_negative_indices \\\n", - "0 [(The scientists placed the XXX in a tiny YYY ... [1] \n", - "1 [(The scientists placed the XXX in a tiny YYY ... [1] \n", - "2 [(The scientists placed the XXX in a tiny YYY ... [1] \n", - "3 [(The scientists placed the XXX in a tiny YYY ... [1] \n", - "4 [(The scientists placed the XXX in a tiny YYY ... [1] \n", + "0 [(The scientists placed the XXX in a tiny YYY ... [1, 3, 9, 11] \n", + "1 [(The scientists placed the XXX in a tiny YYY ... [1, 3, 9, 11] \n", + "2 [(The scientists placed the XXX in a tiny YYY ... [1, 3, 9, 11] \n", "\n", " Predicted \n", - "0 [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0] \n", + "0 [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] \n", "1 [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] \n", - "2 [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] \n", - "3 [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] \n", - "4 [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0] " + "2 [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] " ] }, - "execution_count": 52, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1732,7 +1570,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1751,14 +1589,14 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "8000it [00:00, 189262.97it/s]\n" + "8000it [00:00, 175511.33it/s]\n" ] } ], @@ -1793,7 +1631,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -1806,7 +1644,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1949,7 +1787,7 @@ "[8000 rows x 4 columns]" ] }, - "execution_count": 110, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1967,16 +1805,16 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "dataset.load_graphs(\"../features/semeval/semeval_train_4lang_graphs.pickle\")" + "dataset.load_graphs(\"../features/semeval/semeval_train_4lang_graphs.pickle\", binary=True)" ] }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1985,7 +1823,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1996,7 +1834,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 28, "metadata": {}, "outputs": [], "source": 
[ @@ -2007,7 +1845,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -2022,7 +1860,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "6399it [00:27, 234.51it/s]\n" + "6399it [00:24, 261.28it/s]\n" ] }, { @@ -2066,20 +1904,20 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(['(u_3 / to :2 (u_2 / entity2))'], [], 'Entity-Destination(e1,e2)'),\n", - " (['(u_15 / into :2 (u_2 / entity2))'], [], 'Entity-Destination(e1,e2)'),\n", + " (['(u_8 / into :2 (u_2 / entity2))'], [], 'Entity-Destination(e1,e2)'),\n", " (['(u_264 / place :2 (u_25 / entity1))'], [], 'Entity-Destination(e1,e2)'),\n", - " (['(u_14 / in :2 (u_2 / entity2))'], [], 'Entity-Destination(e1,e2)'),\n", - " (['(u_1200 / give :2 (u_25 / entity1))'], [], 'Entity-Destination(e1,e2)')]" + " (['(u_19 / in :2 (u_2 / entity2))'], [], 'Entity-Destination(e1,e2)'),\n", + " (['(u_1196 / give :2 (u_25 / entity1))'], [], 'Entity-Destination(e1,e2)')]" ] }, - "execution_count": 76, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2091,7 +1929,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -2101,7 +1939,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -2141,7 +1979,7 @@ " \n", " \n", " 1\n", - " [(u_15 / into :2 (u_2 / entity2))]\n", + " [(u_8 / into :2 (u_2 / entity2))]\n", " 0.762172\n", " 0.628086\n", " 0.688663\n", @@ -2155,14 +1993,14 @@ " \n", " \n", " 3\n", - " [(u_14 / in :2 (u_2 / entity2))]\n", + " [(u_19 / in :2 (u_2 / entity2))]\n", " 0.117526\n", " 0.087963\n", " 0.100618\n", " \n", " \n", " 4\n", - " [(u_1200 / give :2 (u_25 / entity1))]\n", + " [(u_1196 / give :2 (u_25 / entity1))]\n", " 0.533333\n", " 0.012346\n", " 0.024133\n", @@ -2174,13 +2012,13 @@ "text/plain": [ " Feature Precision Recall Fscore\n", "0 [(u_3 / to :2 (u_2 / entity2))] 0.590909 0.200617 0.299539\n", - "1 [(u_15 / into :2 (u_2 / entity2))] 0.762172 0.628086 0.688663\n", + "1 [(u_8 / into :2 (u_2 / entity2))] 0.762172 0.628086 0.688663\n", "2 [(u_264 / place :2 (u_25 / entity1))] 0.791667 0.058642 0.109195\n", - "3 [(u_14 / in :2 (u_2 / entity2))] 0.117526 0.087963 0.100618\n", - "4 [(u_1200 / give :2 (u_25 / entity1))] 0.533333 0.012346 0.024133" + "3 [(u_19 / in :2 (u_2 / entity2))] 0.117526 0.087963 0.100618\n", + "4 [(u_1196 / give :2 (u_25 / entity1))] 0.533333 0.012346 0.024133" ] }, - "execution_count": 81, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2200,7 +2038,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -2209,16 +2047,16 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['(u_1 / into :1 (u_2 / arrive|landing|spread|fly|implant|flow|dump|invest|fetch|release|pour|introduce|leak|remove|add|migrate|insert|stuff|import|pack|transport|misplace) :2 (u_3 / entity2))']" + "['(u_1 / into :1 (u_2 / introduce|migrate|arrive|pour|misplace|import|transport|remove|dump|invest|stuff|flow|fly|add|leak|fetch|release|pack|insert|landing|spread|implant) :2 (u_3 / entity2))']" ] }, - "execution_count": 118, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2229,7 +2067,7 @@ }, { "cell_type": "code", - 
"execution_count": 119, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -2238,7 +2076,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2271,10 +2109,10 @@ " \n", " \n", " 0\n", - " [(u_1 / into :1 (u_2 / arrive|landing|spread|f...\n", - " 0.988636\n", - " 0.268519\n", - " 0.42233\n", + " [(u_1 / into :1 (u_2 / introduce|migrate|arriv...\n", + " 0.988701\n", + " 0.270062\n", + " 0.424242\n", " \n", " \n", "\n", @@ -2282,13 +2120,13 @@ ], "text/plain": [ " Feature Precision Recall \\\n", - "0 [(u_1 / into :1 (u_2 / arrive|landing|spread|f... 0.988636 0.268519 \n", + "0 [(u_1 / into :1 (u_2 / introduce|migrate|arriv... 0.988701 0.270062 \n", "\n", - " Fscore \n", - "0 0.42233 " + " Fscore \n", + "0 0.424242 " ] }, - "execution_count": 120, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2306,13 +2144,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from xpotato.dataset.utils import save_dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "tr_df.to_pickle(\"semeval_train.pickle\")\n", - "val_df.to_pickle(\"semeval_val.pickle\")" + "save_dataframe(tr_df, 'semeval_train.tsv')\n", + "save_dataframe(val_df, 'semeval_val.tsv')" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -2320,7 +2174,7 @@ "hash": "2ba6e79cfe500659b64dde21d0b13217ce6375f8dca9d4d575440e3878ce882b" }, "kernelspec": { - "display_name": "Python 3.9.5 64-bit ('base': conda)", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2335,8 +2189,7 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" - }, - "orig_nbformat": 4 + } }, "nbformat": 4, "nbformat_minor": 2 From 8f082ad2411768968747584997a069c223c4ca64 Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Mon, 14 Feb 2022 14:03:35 +0100 Subject: [PATCH 06/15] Reverse directory --- features/botium/saved_features.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 features/botium/saved_features.json diff --git a/features/botium/saved_features.json b/features/botium/saved_features.json deleted file mode 100644 index 3fa66a6..0000000 --- a/features/botium/saved_features.json +++ /dev/null @@ -1 +0,0 @@ -{"OFF": [[["(u_2 / Erotik)"], [], "OFF"], [["(u_12 / blod.*)"], [], "OFF"], [["(u_1 / Schwein)"], [], "OFF"], [["(u_12 / Bloedsinn)"], [], "OFF"], [["(u_22 / sex)"], [], "OFF"], [["(u_1 / scheisse)"], [], "OFF"], [["(u_1 / dumm)"], [], "OFF"], [["(u_1 / sterben)"], [], "OFF"], [["(u1 / .*arsch.*)"], [], "OFF"], [["(u1 / leck)"], [], "OFF"], [["(u1 / toet.*)"], [], "OFF"], [["(u1 / du)", "(u2 / sein)"], [], "OFF"]]} \ No newline at end of file From a3274414ce6bd70062a8dc13f28796e6176e511f Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Mar 2022 13:22:02 +0100 Subject: [PATCH 07/15] A rule class --- xpotato/graph_extractor/rule.py | 123 +++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/xpotato/graph_extractor/rule.py b/xpotato/graph_extractor/rule.py index 2c13c1b..5a81b47 100644 --- a/xpotato/graph_extractor/rule.py +++ b/xpotato/graph_extractor/rule.py @@ -1,5 +1,126 @@ +import json +from typing import Dict, List, Union + + class Rule: - def __init__(self, rule): + def __init__(self, rule, openie=False): 
self.positive_samples = rule[0]
         self.negative_samples = rule[1]
         self.label = rule[2]
+        self.openie = openie
+
+        self.marked_nodes = None
+        if openie:
+            self.marked_nodes = rule[3]
+
+    def __eq__(self, __o: object) -> bool:
+        if not isinstance(__o, Rule):
+            return False
+        return (
+            sorted(self.positive_samples) == sorted(__o.positive_samples)
+            and sorted(self.negative_samples) == sorted(__o.negative_samples)
+            and self.label == __o.label
+        )
+
+    def to_list(self) -> List[List[Union[List[str], str]]]:
+        return (
+            [self.positive_samples, self.negative_samples, self.label]
+            if self.openie == False
+            else [
+                self.positive_samples,
+                self.negative_samples,
+                self.label,
+                self.marked_nodes,
+            ]
+        )
+
+
+class RuleSet:
+    def __init__(self, rules: List[Rule] = None):
+        if rules is None:
+            self.rules = []
+        else:
+            self.rules = rules
+
+    def __iter__(self):
+        for rule in self.rules:
+            yield rule
+
+    def __eq__(self, __o: object) -> bool:
+        if not isinstance(__o, RuleSet):
+            return False
+        return self.rules == __o.rules
+
+    def to_tsv(self, tsv_path: str):
+        with open(tsv_path, "w") as f:
+            for rule in self.rules:
+                positive_samples = ";".join(rule.positive_samples)
+                negative_samples = ";".join(rule.negative_samples)
+                label = rule.label
+                marked_nodes = rule.marked_nodes
+
+                rule_str = f"{label}\t{positive_samples}\t{negative_samples}"
+
+                if rule.openie:
+                    rule_str += f"\t{json.dumps(marked_nodes)}"
+                f.write(rule_str + "\n")
+
+    def from_tsv(self, tsv_path: str):
+        with open(tsv_path, "r") as f:
+            for line in f:
+                line = line.strip("\n")
+                line = line.split("\t")
+
+                positive_samples = [] if line[1] == "" else line[1].split(";")
+                negative_samples = [] if line[2] == "" else line[2].split(";")
+                label = line[0].strip()
+                rule = None
+                if len(line) == 3:
+                    rule = Rule(
+                        [positive_samples, negative_samples, label], openie=False
+                    )
+                elif len(line) == 4:
+                    marked_nodes = [] if line[3] == "" else json.loads(line[3])
+                    rule = Rule(
+                        [positive_samples, negative_samples, label, marked_nodes],
+                        openie=True,
+                    )
+                else:
+                    raise Exception(f"Invalid number of fields: {line}")
+
+                if not rule:
+                    raise Exception(f"Invalid rule: {line}")
+                self.add_rule(rule)
+
+    def from_dict(
+        self, rules: Dict[str, List[List[Union[List[str], str]]]], openie: bool = False
+    ):
+        for key, value in rules.items():
+            for rule in value:
+                self.add_rule(Rule(rule, openie=openie))
+
+    def from_json(self, json_path: str, openie: bool = False):
+        with open(json_path, "r") as f:
+            rules = json.load(f)
+
+        for key, value in rules.items():
+            for rule in value:
+                self.add_rule(Rule(rule, openie=openie))
+
+    def to_json(self, json_path: str):
+        with open(json_path, "w") as f:
+            json.dump(self.to_dict(), f)
+
+    def add_rule(self, rule: Rule):
+        self.rules.append(rule)
+
+    def to_dict(self) -> Dict[str, List[List[Union[List[str], str]]]]:
+        rule_dict = {rule.label: [] for rule in self.rules}
+
+        for rule in self.rules:
+            rule_dict[rule.label].append(rule.to_list())
+
+        return rule_dict
+
+    def to_list(self) -> List[List[Union[List[str], str]]]:
+        return [rule.to_list() for rule in self.rules]

From 5e0c4d3b7c6d4d77c51370fc601474f607a4f50e Mon Sep 17 00:00:00 2001
From: Adam Kovacs
Date: Tue, 8 Mar 2022 13:22:24 +0100
Subject: [PATCH 08/15] Potato can now handle OpenIE

---
 xpotato/graph_extractor/extract.py | 92 +++++++++++++++++++++++++++---
 1 file changed, 83 insertions(+), 9 deletions(-)

diff --git a/xpotato/graph_extractor/extract.py b/xpotato/graph_extractor/extract.py
index f4ffb51..29a84f5 100644
--- 
a/xpotato/graph_extractor/extract.py +++ b/xpotato/graph_extractor/extract.py @@ -5,6 +5,7 @@ import networkx as nx import pandas as pd import stanza +import penman as pn from networkx.algorithms.isomorphism import DiGraphMatcher from sklearn.metrics import precision_recall_fscore_support from tqdm import tqdm @@ -40,7 +41,9 @@ def init_nlp(self): nlp = stanza.Pipeline(self.lang) self.nlp = CachedStanzaPipeline(nlp, self.cache_fn) - def parse_iterable(self, iterable, graph_type="fourlang"): + def parse_iterable(self, iterable, graph_type="fourlang", lang=None): + if lang: + self.lang = lang if graph_type == "fourlang": with TextTo4lang( lang=self.lang, nlp_cache=self.cache_fn, cache_dir=self.cache_dir @@ -74,50 +77,121 @@ class FeatureEvaluator: def __init__(self, graph_format="ud"): self.graph_format = graph_format - def match_features(self, dataset, features, multi=False): + # ADAM: Very important to assign IDs to features from 0 because that's how + # the mapping will work!! + def annotate(self, graph, features): + feature_to_marked_nodes = {} + + for i, feature in enumerate(features): + assert ( + len(feature) == 4 + ), f"Feature must be a 4-tuple for OpenIE, not {feature}" + + positive_features = feature[0] + negative_features = feature[1] + + for positive in positive_features: + p = pn.decode(positive) + first = p.triples[0][0] + assert first == "u_0", f"The IDs must start from 0, not {first}" + + for negative in negative_features: + p = pn.decode(negative) + first = p.triples[0][0] + assert first == "u_0", f"The IDs must start from 0, not {first}" + + feature_to_marked_nodes[i] = feature[3] + features[i] = feature[:3] + + matcher = GraphFormulaMatcher(features, converter=default_pn_to_graph) + feats = matcher.match(graph, return_subgraphs=True) + + for key, i, subgraphs in feats: + triplet = {"relation": key} + marked_nodes = feature_to_marked_nodes[i] + for j, node in enumerate(marked_nodes): + subgraph = subgraphs[j] + + node_to_node = {} + for id, graph_node in subgraph.nodes(data=True): + mapping = graph_node["mapping"] + node_to_node[mapping] = graph_node["name"] + + for k, v in node.items(): + triplet[k] = node_to_node[v] + + yield triplet + + def annotate_dataframe(self, dataset, features): + graphs = dataset.graph.tolist() + + triplets = [] + for graph in graphs: + relations = self.annotate(graph, features) + triplets.append(list(relations)) + d = { + "Sentence": dataset.text.tolist(), + "Triplets": triplets, + } + + return pd.DataFrame(d) + + def match_features(self, dataset, features, multi=False, return_subgraphs=False): graphs = dataset.graph.tolist() matches = [] predicted = [] + matched_graphs = [] matcher = GraphFormulaMatcher(features, converter=default_pn_to_graph) for i, g in tqdm(enumerate(graphs)): - feats = matcher.match(g) + feats = matcher.match(g, return_subgraphs=return_subgraphs) if multi: - self.match_multi(feats, features, matches, predicted) + self.match_multi(feats, features, matches, predicted, matched_graphs) else: - self.match_not_multi(feats, features, matches, predicted) + self.match_not_multi( + feats, features, matches, predicted, matched_graphs + ) d = { "Sentence": dataset.text.tolist(), "Predicted label": predicted, "Matched rule": matches, } + if return_subgraphs: + d["Matched subgraph"] = matched_graphs + df = pd.DataFrame(d) return df - def match_multi(self, feats, features, matches, predicted): + def match_multi(self, feats, features, matches, predicted, matched_graphs): keys = [] matched_rules = [] - for key, feature in feats: + 
matched_subgraphs = []
+        for key, feature, graphs in feats:
             if key not in keys:
                 matched_rules.append(features[feature])
+                matched_subgraphs.append(graphs)
                 keys.append(key)

         if not keys:
             matches.append("")
             predicted.append("")
+            matched_graphs.append("")
         else:
             matches.append(matched_rules)
             predicted.append(keys)
+            matched_graphs.append(matched_subgraphs)

-    def match_not_multi(self, feats, features, matches, predicted):
-        for key, feature in feats:
+    def match_not_multi(self, feats, features, matches, predicted, matched_graphs):
+        for key, feature, graphs in feats:
             matches.append(features[feature])
             predicted.append(key)
+            matched_graphs.append(graphs)
             break
         else:
             matches.append("")
+            matched_graphs.append("")
             predicted.append("")

     def one_versus_rest(self, df, entity):

From 47186f2fe073c2abc1a8d8c465801d8a9ca00e8d Mon Sep 17 00:00:00 2001
From: Adam Kovacs
Date: Tue, 8 Mar 2022 13:22:38 +0100
Subject: [PATCH 09/15] Unittest for rules

---
 tests/test_ruleset.py | 102 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 tests/test_ruleset.py

diff --git a/tests/test_ruleset.py b/tests/test_ruleset.py
new file mode 100644
index 0000000..cc42850
--- /dev/null
+++ b/tests/test_ruleset.py
@@ -0,0 +1,102 @@
+from xpotato.dataset.utils import default_pn_to_graph
+from xpotato.graph_extractor.extract import FeatureEvaluator
+from xpotato.graph_extractor.rule import RuleSet, Rule
+
+import os
+
+dir_name = os.path.dirname(os.path.realpath(__file__))
+
+FEATURE1 = [["(u_0 / find :obl (u_1 / .*) :obj (u_2 / .*))"], [], "FIND"]
+FEATURE2 = [["(u_0 / write :nsubj (u_1 / .*) :obj (u_2 / .*))"], [], "WRITE"]
+
+FEATURE3 = [
+    ["(u_0 / find :obl (u_1 / .*) :obj (u_2 / .*))"],
+    [],
+    "FIND",
+    [{"ARG1": 1, "ARG2": 2}],
+]
+FEATURE4 = [
+    ["(u_0 / write :nsubj (u_1 / .*) :obj (u_2 / .*))"],
+    [],
+    "WRITE",
+    [{"ARG1": 1, "ARG2": 2}],
+]
+
+FEATURE_DICT = {
+    "FIND": [[["(u_0 / find :obl (u_1 / .*) :obj (u_2 / .*))"], [], "FIND"]],
+    "WRITE": [[["(u_0 / write :nsubj (u_1 / .*) :obj (u_2 / .*))"], [], "WRITE"]],
+}
+
+GRAPH = "(u_2 / write :nsubj (u_1 / person) :obj (u_4 / sentence :det (u_3 / this)) :conj (u_6 / find :cc (u_5 / and) :obj (u_7 / object) :obl (u_9 / location :case (u_8 / in))) :punct (u_10 / PERIOD) :root-of (u_0 / root))"
+
+
+def test_rule():
+    rule1 = Rule(FEATURE1)
+
+    assert rule1.to_list() == FEATURE1
+
+
+def test_ruleset_to_list():
+    rule_set = RuleSet([Rule(FEATURE1)])
+
+    rule_set.add_rule(Rule(FEATURE2))
+
+    assert rule_set.to_list() == [FEATURE1, FEATURE2]
+
+
+def test_ruleset_to_dict():
+    rule_set = RuleSet([Rule(FEATURE1), Rule(FEATURE2)])
+
+    assert rule_set.to_dict() == {"FIND": [FEATURE1], "WRITE": [FEATURE2]}
+
+
+def test_ruleset_from_dict_to_list():
+    rule_set = RuleSet()
+    rule_set.from_dict(FEATURE_DICT)
+
+    assert rule_set.to_list() == [FEATURE1, FEATURE2]
+
+
+def test_ruleset_json():
+    rule_set = RuleSet()
+
+    rule_set.from_json(os.path.join(dir_name, "features.json"))
+
+    assert rule_set.to_list() == [FEATURE1, FEATURE2]
+
+
+def test_ruleset_to_tsv():
+    rule_set = RuleSet([Rule(FEATURE1), Rule(FEATURE2)])
+
+    rule_set.to_tsv(os.path.join(dir_name, "features.tsv"))
+
+    rule_set = RuleSet()
+
+    rule_set.from_tsv(os.path.join(dir_name, "features.tsv"))
+
+    assert rule_set.to_list() == [FEATURE1, FEATURE2]
+
+
+def test_ruleset_openie():
+    rule_set = RuleSet([Rule(FEATURE3, openie=True), Rule(FEATURE4, openie=True)])
+    rule_set.to_tsv(os.path.join(dir_name, "features_openie.tsv"))
+
+    rule_set = RuleSet()
+    
rule_set.from_tsv(os.path.join(dir_name, "features_openie.tsv")) + + assert rule_set.to_list() == [FEATURE3, FEATURE4] + + +def test_openie_matching(): + evaluator = FeatureEvaluator() + + G, _ = default_pn_to_graph(GRAPH) + + rule_set = RuleSet([Rule(FEATURE3, openie=True), Rule(FEATURE4, openie=True)]) + + triplets = list(evaluator.annotate(G, rule_set.to_list())) + + assert triplets == [ + {"relation": "FIND", "ARG1": "location", "ARG2": "object"}, + {"relation": "WRITE", "ARG1": "person", "ARG2": "sentence"}, + ] From 719dd74741f4c95242699cd7dd93890ea6a3b87c Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Mar 2022 13:57:53 +0100 Subject: [PATCH 10/15] Json and TSV both work --- frontend/app.py | 12 +++++------- frontend/utils.py | 30 +++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/frontend/app.py b/frontend/app.py index fa442ca..c2668a8 100644 --- a/frontend/app.py +++ b/frontend/app.py @@ -26,6 +26,7 @@ rerun, rule_chooser, save_ruleset, + read_ruleset, save_after_modify, save_dataframe, match_texts, @@ -61,8 +62,7 @@ def inference_mode(evaluator, hand_made_rules): st.session_state.download = st.sidebar.selectbox("", options=[False, True], key=2) if hand_made_rules: - with open(hand_made_rules) as f: - st.session_state.features = json.load(f) + read_ruleset(hand_made_rules) extractor = init_extractor(lang, graph_format) @@ -180,7 +180,7 @@ def inference_mode(evaluator, hand_made_rules): [";".join(feat[1]) for feat in features_merged], [feat[2] for feat in features_merged], ) - save_rules = hand_made_rules or "saved_features.json" + save_rules = hand_made_rules or "saved_features.tsv" save_ruleset(save_rules, st.session_state.features) rerun() @@ -225,8 +225,7 @@ def inference_mode(evaluator, hand_made_rules): def simple_mode(evaluator, data, val_data, graph_format, feature_path, hand_made_rules): if hand_made_rules: - with open(hand_made_rules) as f: - st.session_state.features = json.load(f) + read_ruleset(hand_made_rules) if "df" not in st.session_state: st.session_state.df = data.copy() @@ -635,8 +634,7 @@ def simple_mode(evaluator, data, val_data, graph_format, feature_path, hand_made def advanced_mode(evaluator, train_data, graph_format, feature_path, hand_made_rules): data = read_df(train_data) if hand_made_rules: - with open(hand_made_rules) as f: - st.session_state.features = json.load(f) + read_ruleset(hand_made_rules) if "df" not in st.session_state: st.session_state.df = data.copy() if "annotated" not in st.session_state.df: diff --git a/frontend/utils.py b/frontend/utils.py index 77d36e6..f2adfa2 100644 --- a/frontend/utils.py +++ b/frontend/utils.py @@ -17,6 +17,7 @@ from xpotato.graph_extractor.extract import FeatureEvaluator, GraphExtractor from xpotato.models.trainer import GraphTrainer from xpotato.dataset.utils import default_pn_to_graph +from xpotato.graph_extractor.rule import RuleSet, Rule from tuw_nlp.graph.utils import GraphFormulaMatcher, graph_to_pn from contextlib import contextmanager @@ -157,8 +158,27 @@ def to_dot(graph, marked_nodes=set(), integ=False): def save_ruleset(path, features): - with open(path, "w+") as f: - json.dump(features, f) + rule_set = RuleSet() + rule_set.from_dict(features) + + if path.endswith(".json"): + rule_set.to_json(path) + elif path.endswith(".tsv"): + rule_set.to_tsv(path) + else: + raise ValueError("Unknown file extension, currently only .json and .tsv are supported") + +def read_ruleset(path): + rule_set = RuleSet() + + if path.endswith(".json"): + 
rule_set.from_json(path) + elif path.endswith(".tsv"): + rule_set.from_tsv(path) + else: + raise ValueError("Unknown file extension, currently only .json and .tsv are supported") + + st.session_state.features = rule_set.to_dict() def d_clean(string): @@ -218,7 +238,7 @@ def save_after_modify(hand_made_rules, classes=None): [feat[2] for feat in features_merged], ) - save_rules = hand_made_rules or "saved_features.json" + save_rules = hand_made_rules or "saved_features.tsv" save_ruleset(save_rules, st.session_state.features) st.session_state.rows_to_delete = [] rerun() @@ -360,7 +380,7 @@ def show_ml_feature(classes, hand_made_rules): [";".join(feat[0]) for feat in st.session_state.features[classes]], [";".join(feat[1]) for feat in st.session_state.features[classes]], ) - save_rules = hand_made_rules or "saved_features.json" + save_rules = hand_made_rules or "saved_features.tsv" save_ruleset(save_rules, st.session_state.features) rerun() @@ -532,7 +552,7 @@ def add_rule_manually(classes, hand_made_rules): [";".join(feat[0]) for feat in st.session_state.features[classes]], [";".join(feat[1]) for feat in st.session_state.features[classes]], ) - save_rules = hand_made_rules or "saved_features.json" + save_rules = hand_made_rules or "saved_features.tsv" save_ruleset(save_rules, st.session_state.features) rerun() st.markdown( From 4f117db3e715f982924d2bfbda356ca82fde62c8 Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Mar 2022 13:58:36 +0100 Subject: [PATCH 11/15] Test files --- tests/features.json | 1 + tests/features.tsv | 2 ++ tests/features_openie.tsv | 2 ++ 3 files changed, 5 insertions(+) create mode 100644 tests/features.json create mode 100644 tests/features.tsv create mode 100644 tests/features_openie.tsv diff --git a/tests/features.json b/tests/features.json new file mode 100644 index 0000000..7862bc2 --- /dev/null +++ b/tests/features.json @@ -0,0 +1 @@ +{"FIND": [[["(u_0 / find :obl (u_1 / .*) :obj (u_2 / .*))"], [], "FIND"]], "WRITE": [[["(u_0 / write :nsubj (u_1 / .*) :obj (u_2 / .*))"], [], "WRITE"]]} \ No newline at end of file diff --git a/tests/features.tsv b/tests/features.tsv new file mode 100644 index 0000000..4175f6c --- /dev/null +++ b/tests/features.tsv @@ -0,0 +1,2 @@ +FIND (u_0 / find :obl (u_1 / .*) :obj (u_2 / .*)) +WRITE (u_0 / write :nsubj (u_1 / .*) :obj (u_2 / .*)) diff --git a/tests/features_openie.tsv b/tests/features_openie.tsv new file mode 100644 index 0000000..47ccf01 --- /dev/null +++ b/tests/features_openie.tsv @@ -0,0 +1,2 @@ +FIND (u_0 / find :obl (u_1 / .*) :obj (u_2 / .*)) [{"ARG1": 1, "ARG2": 2}] +WRITE (u_0 / write :nsubj (u_1 / .*) :obj (u_2 / .*)) [{"ARG1": 1, "ARG2": 2}] From f060150b21d491865cc477e8e1dcbd895eb7883a Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Mar 2022 14:32:43 +0100 Subject: [PATCH 12/15] Little formatting --- frontend/utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/frontend/utils.py b/frontend/utils.py index f2adfa2..19b9461 100644 --- a/frontend/utils.py +++ b/frontend/utils.py @@ -160,13 +160,16 @@ def to_dot(graph, marked_nodes=set(), integ=False): def save_ruleset(path, features): rule_set = RuleSet() rule_set.from_dict(features) - + if path.endswith(".json"): rule_set.to_json(path) elif path.endswith(".tsv"): rule_set.to_tsv(path) else: - raise ValueError("Unknown file extension, currently only .json and .tsv are supported") + raise ValueError( + "Unknown file extension, currently only .json and .tsv are supported" + ) + def read_ruleset(path): rule_set = RuleSet() @@ 
-176,7 +179,9 @@ def read_ruleset(path): elif path.endswith(".tsv"): rule_set.from_tsv(path) else: - raise ValueError("Unknown file extension, currently only .json and .tsv are supported") + raise ValueError( + "Unknown file extension, currently only .json and .tsv are supported" + ) st.session_state.features = rule_set.to_dict() From f42a70a3fa7ed369c730b52e55805cb86d6d3dfe Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Mar 2022 14:32:56 +0100 Subject: [PATCH 13/15] OpenIE notebook --- notebooks/openie.ipynb | 399 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 notebooks/openie.ipynb diff --git a/notebooks/openie.ipynb b/notebooks/openie.ipynb new file mode 100644 index 0000000..bf781e1 --- /dev/null +++ b/notebooks/openie.ipynb @@ -0,0 +1,399 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "17da8e39", + "metadata": {}, + "source": [ + "## Minimal example of OpenIE" + ] + }, + { + "cell_type": "markdown", + "id": "e8f2d686", + "metadata": {}, + "source": [ + "First define the sentence you want to annotate" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c8c701b1", + "metadata": {}, + "outputs": [], + "source": [ + "sentences = ['This property is defined by ComponentType defined in OPC UA DI.', \n", + " 'The IMachineryItemVendorNameplateType is a subtype of the 2:IVendorNameplateType defined in OPC 10000-100.']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "7d7cfb0f", + "metadata": {}, + "outputs": [], + "source": [ + "# Import the evaluators\n", + "from xpotato.graph_extractor.extract import GraphExtractor, FeatureEvaluator" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "51a416b1", + "metadata": {}, + "outputs": [], + "source": [ + "evaluator = FeatureEvaluator()\n", + "extractor = GraphExtractor(cache_fn='openie_en')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dde383b7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-08 14:20:00 INFO: Loading these models for language: en (English):\n", + "============================\n", + "| Processor | Package |\n", + "----------------------------\n", + "| tokenize | combined |\n", + "| pos | combined |\n", + "| lemma | combined |\n", + "| depparse | combined |\n", + "| sentiment | sstplus |\n", + "| constituency | wsj |\n", + "| ner | ontonotes |\n", + "============================\n", + "\n", + "2022-03-08 14:20:00,550 : core (112) - INFO - Loading these models for language: en (English):\n", + "============================\n", + "| Processor | Package |\n", + "----------------------------\n", + "| tokenize | combined |\n", + "| pos | combined |\n", + "| lemma | combined |\n", + "| depparse | combined |\n", + "| sentiment | sstplus |\n", + "| constituency | wsj |\n", + "| ner | ontonotes |\n", + "============================\n", + "\n", + "2022-03-08 14:20:00 INFO: Use device: cpu\n", + "2022-03-08 14:20:00,552 : core (123) - INFO - Use device: cpu\n", + "2022-03-08 14:20:00 INFO: Loading: tokenize\n", + "2022-03-08 14:20:00,553 : core (129) - INFO - Loading: tokenize\n", + "2022-03-08 14:20:00 INFO: Loading: pos\n", + "2022-03-08 14:20:00,562 : core (129) - INFO - Loading: pos\n", + "2022-03-08 14:20:00 INFO: Loading: lemma\n", + "2022-03-08 14:20:00,821 : core (129) - INFO - Loading: lemma\n", + "2022-03-08 14:20:01 INFO: Loading: depparse\n", + "2022-03-08 14:20:01,039 : core (129) - INFO - Loading: depparse\n", + "2022-03-08 
14:20:01 INFO: Loading: sentiment\n", + "2022-03-08 14:20:01,397 : core (129) - INFO - Loading: sentiment\n", + "2022-03-08 14:20:01 INFO: Loading: constituency\n", + "2022-03-08 14:20:01,807 : core (129) - INFO - Loading: constituency\n", + "2022-03-08 14:20:02 INFO: Loading: ner\n", + "2022-03-08 14:20:02,258 : core (129) - INFO - Loading: ner\n", + "2022-03-08 14:20:02 INFO: Done loading processors!\n", + "2022-03-08 14:20:02,669 : core (179) - INFO - Done loading processors!\n", + "2022-03-08 14:20:02,671 : pipeline (45) - INFO - creating new NLP cache in openie_en\n", + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.09it/s]\n" + ] + } + ], + "source": [ + "# Parse the sentences to graphs\n", + "graphs = list(extractor.parse_iterable(sentences, graph_type='ud', lang='en'))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1b121538", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "finite_state_machine\n", + "\n", + "\n", + "\n", + "ComponentType\n", + "\n", + "ComponentType\n", + "\n", + "\n", + "\n", + "by\n", + "\n", + "by\n", + "\n", + "\n", + "\n", + "ComponentType->by\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "define\n", + "\n", + "define\n", + "\n", + "\n", + "\n", + "ComponentType->define\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "DI\n", + "\n", + "DI\n", + "\n", + "\n", + "\n", + "OPC\n", + "\n", + "OPC\n", + "\n", + "\n", + "\n", + "OPC->DI\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "UA\n", + "\n", + "UA\n", + "\n", + "\n", + "\n", + "OPC->UA\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "in\n", + "\n", + "in\n", + "\n", + "\n", + "\n", + "OPC->in\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "PERIOD\n", + "\n", + "PERIOD\n", + "\n", + "\n", + "\n", + "be\n", + "\n", + "be\n", + "\n", + "\n", + "\n", + "define->ComponentType\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "define->OPC\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "define->PERIOD\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "define->be\n", + "\n", + "\n", + "auxCOLONpass\n", + "\n", + "\n", + "\n", + "property\n", + "\n", + "property\n", + "\n", + "\n", + "\n", + "define->property\n", + "\n", + "\n", + "nsubjCOLONpass\n", + "\n", + "\n", + "\n", + "this\n", + "\n", + "this\n", + "\n", + "\n", + "\n", + "property->this\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "root\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "root->define\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can also check any of the graphs\n", + "from xpotato.models.utils import to_dot\n", + "from graphviz import Source\n", + "\n", + "Source(to_dot(graphs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3112e624", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the featureset\n", + "feature1 = ['(u_0 / define :obl (u_1 / .*) :nsubj.* (u_2 / .*))'], [], 'DEFINED', [{'ARG1': 1, 'ARG2': 2}]\n", + "feature2 = ['(u_0 / subtype :nmod (u_1 / .*) :nsubj (u_2 / .*))'], [], 'SUBTYPE', [{'ARG1': 1, 
'ARG2': 2}]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "847df12f", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a ruleset from the features\n", + "from xpotato.graph_extractor.rule import RuleSet, Rule\n", + "\n", + "rule_set = RuleSet([Rule(feature1, openie=True), Rule(feature2, openie=True)])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "c7fd16bb", + "metadata": {}, + "outputs": [], + "source": [ + "# Annotate sentence with triplets\n", + "triplets = list(evaluator.annotate(graphs[0], rule_set.to_list()))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "620e73ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'relation': 'DEFINED', 'ARG1': 'ComponentType', 'ARG2': 'property'}]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "triplets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bd8ed4b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "base" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2cb6d295e40c1a397fb75394326a0a864a13da11 Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Tue, 8 Mar 2022 14:33:17 +0100 Subject: [PATCH 14/15] POTATO release v0.1.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6837559..b860251 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="xpotato", - version="0.0.9", + version="0.1.0", description="XAI human-in-the-loop information extraction framework", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", From 1bc65007d276f0a4c9c8e18a7bc91fe78b00e644 Mon Sep 17 00:00:00 2001 From: Adam Kovacs Date: Wed, 9 Mar 2022 10:29:06 +0100 Subject: [PATCH 15/15] Typo in README --- features/hasoc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/features/hasoc/README.md b/features/hasoc/README.md index 7f3323b..78b3fed 100644 --- a/features/hasoc/README.md +++ b/features/hasoc/README.md @@ -15,7 +15,7 @@ Prebuilt rule-systems are available in this directory for the _2019, 2020, 2021_ Then the frontend of POTATO can be started from the __frontend__ directory: ```bash -streamlit run app.py -- -t ../features/hasoc/hasoc_2021_train_amrtsv -v ../features/hasoc/hasoc_2021_val_amr.tsv -hr ../features/hasoc/2021_train_features_task1.json +streamlit run app.py -- -t ../features/hasoc/hasoc_2021_train_amr.tsv -v ../features/hasoc/hasoc_2021_val_amr.tsv -hr ../features/hasoc/2021_train_features_task1.json ``` If you want to reproduce our output run _evaluate.py_ from the _scripts_ directory.
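
A minimal usage sketch of the rule persistence introduced in this series (it mirrors tests/test_ruleset.py above; the output file name is illustrative):

    from xpotato.graph_extractor.rule import Rule, RuleSet

    # a rule is [positive patterns, negative patterns, label]
    feature = [["(u_0 / find :obl (u_1 / .*) :obj (u_2 / .*))"], [], "FIND"]

    rule_set = RuleSet([Rule(feature)])
    rule_set.to_tsv("features.tsv")  # one rule per line: label, positives, negatives

    loaded = RuleSet()
    loaded.from_tsv("features.tsv")
    assert loaded.to_list() == [feature]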