From c82d65ed3512bb37e7c0873600e638e5e5270d4e Mon Sep 17 00:00:00 2001 From: "Olivie Franklova (CZ)" Date: Fri, 26 Apr 2024 20:20:14 +0200 Subject: [PATCH] Update Readme --- .github/workflows/coverage.yml | 0 README.md | 23 ++++++++++++---- functions.py => similarity/functions.py | 36 ++++--------------------- 3 files changed, 23 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/coverage.yml rename functions.py => similarity/functions.py (64%) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 2780f77..7b73a1a 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,24 @@ ## What is Datasets Similarity? Datasets Similarity is project -### Structure +## Structure +- **Source code** is in folder [similarity](similarity). +- **Source code for column2Vec** is in folder [column2Vec](column2Vec). +- **Tests** are in folder [test](test). +- **Data** are stored in folders [**data**](data) and [**data_validation**](data_validation). +- **Main folder** contains: folder .github, files .gitignore, CONTRIBUTING.MD, LICENSE, README.md, requirements.txt and main.py + +--- +**.github** folder contains GitHub workflows. + +**column2Vec** folder contains all files for [column2Vec](#column2Vec) feature. +More about the structure of this folder can be found [here](column2Vec/README.md/#structure). + **Datasets** for testing are stored in [**data**](data) and [**data_validation**](data_validation) Corresponding link, name and eventual description for each dataset is -stored in DatasetDescription.md in belonging folder. +stored in DatasetDescription.md in the corresponding folder ([**data**](data/DatasetDescription.md), [**data_validation**](data_validation/DatasetDescription.md)). Both folders contain file DataShow.md with metadata information for each dataset. -**column2Vec** folder contains all files for [column2Vec](#column2Vec) feature. 
-More about structure of this folder in [here](column2Vec/README.md/#structure), - ### Column2Vec ## How to run @@ -35,7 +44,11 @@ pytest types_test.py #test name to run Or you can run all the test by running this: ```bash python -m unittest + #or + pytest ``` +**Please be aware that some tests in the test_column2Vec +module may take a long time.** ## How to contribute Please see our [**Contribution Guidelines**](CONTRIBUTING.md). diff --git a/functions.py b/similarity/functions.py similarity index 64% rename from functions.py rename to similarity/functions.py index a67c78d..7e42b77 100644 --- a/functions.py +++ b/similarity/functions.py @@ -1,33 +1,8 @@ -from itertools import compress - -from similarity.DataFrameMetadata import DataFrameMetadata, CategoricalMetadata -from similarity.Types import Types -from typing import Optional - -import pandas as pd -import numpy as np +""" +This module contains helpful functions +""" import os -import re -from sentence_transformers import SentenceTransformer -import gensim.downloader as api - - -# def get_world_embedding(world): -# # takes 3-10 minutes to load -# global wv -# if not wv: -# wv = api.load('word2vec-google-news-300') -# return wv[world] - - -# sbert_model: Optional[SentenceTransformer] = None -# -# -# def get_sbert_model() -> SentenceTransformer: -# global sbert_model -# if not sbert_model: -# sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') -# return sbert_model +import pandas as pd def load__csv_files_from_folder(folder: str) -> (list[pd.DataFrame], list[str]): @@ -44,6 +19,7 @@ def load__csv_files_from_folder(folder: str) -> (list[pd.DataFrame], list[str]): names.append(file.replace(".csv", "")) return data, names + def create_string_from_columns(database: list[pd.DataFrame], table_names: list[str]) -> (list[str], list[str]): """ For each column in each table in database it creates string from that column. 
@@ -59,5 +35,3 @@ def create_string_from_columns(database: list[pd.DataFrame], table_names: list[s str(table[column].tolist()).replace("\'", "").replace("]", "").replace("[", "")) # column to string sentences_datasets.append(name) return sentences, sentences_datasets - -