forked from NEASQC/WP6_QNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
12,448 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
Test accuracy for run 0: 0.6586466165413534
Test accuracy for run 1: 0.6596491228070176
Test accuracy for run 2: 0.6446115288220552
Test accuracy for run 3: 0.6827067669172933
Test accuracy for run 4: 0.6741854636591479
Test accuracy for run 5: 0.6676691729323309
Test accuracy for run 6: 0.6581453634085213
Test accuracy for run 7: 0.6481203007518797
Test accuracy for run 8: 0.6756892230576441
Test accuracy for run 9: 0.656140350877193
Test accuracy for run 10: 0.6516290726817042
Test accuracy for run 11: 0.6606516290726817
Test accuracy for run 12: 0.6636591478696742
Test accuracy for run 13: 0.6691729323308271
Test accuracy for run 14: 0.6596491228070176
Test accuracy for run 15: 0.6661654135338346
Test accuracy for run 16: 0.6451127819548872
Test accuracy for run 17: 0.6476190476190476
Test accuracy for run 18: 0.6290726817042607
Test accuracy for run 19: 0.6716791979949874
Test accuracy for run 20: 0.6466165413533834
Test accuracy for run 21: 0.6606516290726817
Test accuracy for run 22: 0.6601503759398496
Test accuracy for run 23: 0.6781954887218045
Test accuracy for run 24: 0.66265664160401
Test accuracy for run 25: 0.6571428571428571
Test accuracy for run 26: 0.631077694235589
Test accuracy for run 27: 0.6736842105263158
Test accuracy for run 28: 0.6385964912280702
Test accuracy for run 29: 0.6636591478696742
Average test accuracy: 0.6587468671679199
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,278 @@ | ||
""" | ||
DimReduction | ||
============ | ||
Module containing the base class for performing dimensionality reduction. | ||
""" | ||
from abc import ABC, abstractmethod | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import sklearn.decomposition as skd | ||
import sklearn.manifold as skm | ||
import umap | ||
|
||
|
||
class DimReduction(ABC):
    """
    Base class for dimensionality reduction of
    vectors representing sentences.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int
    ) -> None:
        """
        Initialise the dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds
            to a sentence. It must contain one column named
            'sentence_embedding', with the vector representation
            of each sentence.
        dim_out : int
            Desired output dimension of the vectors.

        Raises
        ------
        ValueError
            If the dataset has no 'sentence_embedding' column.
        """
        self.dataset = dataset
        try:
            self.sentence_vectors = dataset['sentence_embedding'].to_list()
        except KeyError as err:
            # Chain the original KeyError so the traceback shows the cause.
            raise ValueError(
                'Sentence vector not present in the dataset.') from err
        self.dim_out = dim_out

    @abstractmethod
    def reduce_dimension(self) -> None:
        """
        Fit the dataset to output vectors with the desired dimension.
        """

    def save_dataset(
        self, filename: str,
        dataset_path: str) -> None:
        """
        Save the reduced dataset as a TSV file in a given path.

        Parameters
        ----------
        filename : str
            Name of the file to save to (without extension).
        dataset_path : str
            Path where to store the dataset.
        """
        # Bug fix: the filename argument was previously ignored.
        filepath = f"{dataset_path}{filename}.tsv"
        self.dataset.to_csv(
            filepath, sep='\t', index=False
        )
|
||
|
||
class PCA(DimReduction):
    """
    Principal component analysis dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the PCA dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.PCA.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.pca_sk = skd.PCA(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.pca_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class ICA(DimReduction):
    """
    Independent component analysis dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the ICA dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.FastICA.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.ica_sk = skd.FastICA(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.ica_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class TSVD(DimReduction):
    """
    Truncated singular value decomposition dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the TSVD dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.TruncatedSVD.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.tsvd_sk = skd.TruncatedSVD(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.tsvd_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class UMAP(DimReduction):
    """
    UMAP dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the UMAP dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to umap.UMAP.
            See
            https://umap-learn.readthedocs.io/en/latest/parameters.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.umap_sk = umap.UMAP(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.umap_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class TSNE(DimReduction):
    """
    t-SNE dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the TSNE dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.manifold.TSNE.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.tsne_sk = skm.TSNE(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        # TSNE.fit_transform requires an array, not a list of lists.
        reduced = self.tsne_sk.fit_transform(np.array(self.sentence_vectors))
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/env python3 | ||
|
||
import sys | ||
import argparse | ||
import json | ||
import pandas as pd | ||
import numpy as np | ||
from dim_reduction import (PCA, ICA, TSVD, UMAP, TSNE) | ||
|
||
def main():
    """
    CLI entry point: read a JSON dataset with sentence embeddings,
    reduce their dimension with the chosen algorithm, and write the
    result back as JSON.

    Returns
    -------
    int
        0 on success, 1 on unsupported algorithm or any runtime error
        (so the shell sees a non-zero exit code on failure).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--data", help = "Json data file with embeddings")
    parser.add_argument("-o", "--dataout", help = "Json data file with reduced embeddings")
    parser.add_argument("-n", "--dimout", help = "Desired output dimension of the vectors")
    parser.add_argument("-a", "--algorithm", help = "Dimensionality reduction algorithm - 'PCA', 'ICA', 'TSVD', 'UMAP' or 'TSNE'")
    args = parser.parse_args()
    print(args)

    # Dispatch table instead of an if/elif chain; also serves as the
    # single source of truth for the supported algorithm names.
    reducers = {
        'PCA': PCA,
        'ICA': ICA,
        'TSVD': TSVD,
        'UMAP': UMAP,
        'TSNE': TSNE,
    }
    if args.algorithm not in reducers:
        print(f"{args.algorithm} not supported.")
        return 1

    try:
        df = pd.read_json(args.data)

        # Each 'sentence_vectorized' entry is a one-element list; unwrap it.
        vectors = df["sentence_vectorized"].tolist()
        df["sentence_embedding"] = [x[0] for x in vectors]
        # .copy() avoids pandas chained-assignment on a view of df.
        reduceddf = df[['class', 'sentence', 'tree', 'sentence_embedding']].copy()
        reduceddf['class'] = reduceddf['class'].astype(str)

        reducer = reducers[args.algorithm](reduceddf, int(args.dimout))
        reducer.reduce_dimension()

        reducer.dataset = reducer.dataset.rename(
            columns={"reduced_sentence_embedding": "sentence_vectorized"})

        # Re-wrap each reduced vector in a one-element list to match the
        # input file's schema.
        vectors = reducer.dataset["sentence_vectorized"].tolist()
        reducer.dataset["sentence_vectorized"] = [[x] for x in vectors]

        dflistofdict = reducer.dataset[
            ['class', 'sentence', 'tree', 'sentence_vectorized']
        ].apply(lambda x: x.to_dict(), axis=1).to_list()

        with open(args.dataout, 'w') as fout:
            json.dump(dflistofdict, fout, indent=2)

        print("Done!")

    # Top-level boundary: report the failure and exit non-zero instead of
    # silently exiting 0 as before.
    except Exception as err:
        print(f"Unexpected {err=}")
        return 1

    return 0

if __name__ == "__main__":
    sys.exit(int(main() or 0))
Oops, something went wrong.