From c82d65ed3512bb37e7c0873600e638e5e5270d4e Mon Sep 17 00:00:00 2001 From: "Olivie Franklova (CZ)" Date: Fri, 26 Apr 2024 20:20:14 +0200 Subject: [PATCH] Update Readme --- .github/workflows/coverage.yml | 0 README.md | 23 ++++++++++++---- functions.py => similarity/functions.py | 36 ++++--------------------- 3 files changed, 23 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/coverage.yml rename functions.py => similarity/functions.py (64%) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 2780f77..7b73a1a 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,24 @@ ## What is Datasets Similarity? Datasets Similarity is project -### Structure +## Structure +- **Source code** is in folder [similarity](similarity). +- **Source code for column2Vec** is in folder [column2Vec](column2Vec). +- **Tests** are in folder [test](test). +- **Data** are stored in folders [**data**](data) and [**data_validation**](data_validation). +- **Main folder** contains: folder .github, files .gitignore, CONTRIBUTING.MD, LICENSE, README.md, requirements.txt and main.py + +--- +**.github** folder contains GitHub workflows. + +**column2Vec** folder contains all files for [column2Vec](#column2Vec) feature. +More about the structure of this folder can be found [here](column2Vec/README.md/#structure). + **Datasets** for testing are stored in [**data**](data) and [**data_validation**](data_validation) Corresponding link, name and eventual description for each dataset is -stored in DatasetDescription.md in belonging folder. +stored in DatasetDescription.md in the corresponding folder ([**data**](data/DatasetDescription.md), [**data_validation**](data_validation/DatasetDescription.md)). Both folders contain file DataShow.md with metadata information for each dataset. -**column2Vec** folder contains all files for [column2Vec](#column2Vec) feature. 
-More about structure of this folder in [here](column2Vec/README.md/#structure), - ### Column2Vec ## How to run @@ -35,7 +44,11 @@ pytest types_test.py #test name to run Or you can run all the test by running this: ```bash python -m unittest + #or + pytest ``` +**Please be aware that some tests in the test_column2Vec +module may take a long time.** ## How to contribute Please see our [**Contribution Guidelines**](CONTRIBUTING.md). diff --git a/functions.py b/similarity/functions.py similarity index 64% rename from functions.py rename to similarity/functions.py index a67c78d..7e42b77 100644 --- a/functions.py +++ b/similarity/functions.py @@ -1,33 +1,8 @@ -from itertools import compress - -from similarity.DataFrameMetadata import DataFrameMetadata, CategoricalMetadata -from similarity.Types import Types -from typing import Optional - -import pandas as pd -import numpy as np +""" +This module contains helpful functions +""" import os -import re -from sentence_transformers import SentenceTransformer -import gensim.downloader as api - - -# def get_world_embedding(world): -# # takes 3-10 minutes to load -# global wv -# if not wv: -# wv = api.load('word2vec-google-news-300') -# return wv[world] - - -# sbert_model: Optional[SentenceTransformer] = None -# -# -# def get_sbert_model() -> SentenceTransformer: -# global sbert_model -# if not sbert_model: -# sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') -# return sbert_model +import pandas as pd def load__csv_files_from_folder(folder: str) -> (list[pd.DataFrame], list[str]): @@ -44,6 +19,7 @@ def load__csv_files_from_folder(folder: str) -> (list[pd.DataFrame], list[str]): names.append(file.replace(".csv", "")) return data, names + def create_string_from_columns(database: list[pd.DataFrame], table_names: list[str]) -> (list[str], list[str]): """ For each column in each table in database it creates string from that column. 
@@ -59,5 +35,3 @@ def create_string_from_columns(database: list[pd.DataFrame], table_names: list[s str(table[column].tolist()).replace("\'", "").replace("]", "").replace("[", "")) # column to string sentences_datasets.append(name) return sentences, sentences_datasets - -