Skip to content

Commit

Permalink
Dimension reduction
Browse files Browse the repository at this point in the history
  • Loading branch information
DaigaDe committed Feb 22, 2024
1 parent 55af2e2 commit b8cf54d
Show file tree
Hide file tree
Showing 5 changed files with 12,448 additions and 1 deletion.
31 changes: 31 additions & 0 deletions neasqc_wp61/benchmarking/results/reviews_100ember.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Test accuracy for run 0: 0.6586466165413534
Test accuracy for run 1: 0.6596491228070176
Test accuracy for run 2: 0.6446115288220552
Test accuracy for run 3: 0.6827067669172933
Test accuracy for run 4: 0.6741854636591479
Test accuracy for run 5: 0.6676691729323309
Test accuracy for run 6: 0.6581453634085213
Test accuracy for run 7: 0.6481203007518797
Test accuracy for run 8: 0.6756892230576441
Test accuracy for run 9: 0.656140350877193
Test accuracy for run 10: 0.6516290726817042
Test accuracy for run 11: 0.6606516290726817
Test accuracy for run 12: 0.6636591478696742
Test accuracy for run 13: 0.6691729323308271
Test accuracy for run 14: 0.6596491228070176
Test accuracy for run 15: 0.6661654135338346
Test accuracy for run 16: 0.6451127819548872
Test accuracy for run 17: 0.6476190476190476
Test accuracy for run 18: 0.6290726817042607
Test accuracy for run 19: 0.6716791979949874
Test accuracy for run 20: 0.6466165413533834
Test accuracy for run 21: 0.6606516290726817
Test accuracy for run 22: 0.6601503759398496
Test accuracy for run 23: 0.6781954887218045
Test accuracy for run 24: 0.66265664160401
Test accuracy for run 25: 0.6571428571428571
Test accuracy for run 26: 0.631077694235589
Test accuracy for run 27: 0.6736842105263158
Test accuracy for run 28: 0.6385964912280702
Test accuracy for run 29: 0.6636591478696742
Average test accuracy: 0.6587468671679199
278 changes: 278 additions & 0 deletions neasqc_wp61/data/data_processing/dim_reduction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
"""
DimReduction
============
Module containing the base class for performing dimensionality reduction.
"""
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
import sklearn.decomposition as skd
import sklearn.manifold as skm
import umap


class DimReduction(ABC):
    """
    Base class for dimensionality reduction of
    vectors representing sentences.
    """
    def __init__(
        self, dataset : pd.DataFrame, dim_out : int
    )-> None:
        """
        Initialise the dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds
            to a sentence. It must contain one column named
            'sentence_embedding', with the vector representation
            of each sentence.
        dim_out : int
            Desired output dimension of the vectors.

        Raises
        ------
        ValueError
            If the dataset has no 'sentence_embedding' column.
        """
        self.dataset = dataset
        try:
            self.sentence_vectors = dataset['sentence_embedding'].to_list()
        except KeyError as err:
            # Chain the original KeyError and name the actual missing column.
            raise ValueError(
                'Sentence embedding not present in the dataset.'
            ) from err
        self.dim_out = dim_out

    @abstractmethod
    def reduce_dimension(self)-> None:
        """
        Fit the dataset to output vectors with the desired dimension.
        """

    def save_dataset(
        self, filename : str,
        dataset_path : str)-> None:
        """
        Save the (reduced) dataset as a TSV file in a given path.

        Parameters
        ----------
        filename : str
            Name of the file to save to (without the .tsv extension).
        dataset_path : str
            Path where to store the dataset. It is prepended verbatim,
            so it should end with a path separator.
        """
        # Bug fix: the filename argument was previously ignored and every
        # save went to the same hard-coded file name.
        filepath = f"{dataset_path}{filename}.tsv"
        self.dataset.to_csv(
            filepath, sep='\t', index = False
        )


class PCA(DimReduction):
    """
    Principal component analysis (PCA) dimensionality reduction.
    """
    def __init__(
        self, dataset : pd.DataFrame, dim_out : int, **kwargs
    )-> None:
        """
        Initialise the PCA dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Dataframe with one sentence per row; must contain a
            'sentence_embedding' column holding each sentence's vector.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.PCA.
            They can be found in
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.pca_sk = skd.PCA(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self)-> None:
        """
        Fit the sentence vectors and store the reduced vectors in a
        'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.pca_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()


class ICA(DimReduction):
    """
    Independent component analysis (ICA) dimensionality reduction.
    """
    def __init__(
        self, dataset : pd.DataFrame, dim_out : int, **kwargs
    )-> None:
        """
        Initialise the ICA dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Dataframe with one sentence per row; must contain a
            'sentence_embedding' column holding each sentence's vector.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.FastICA.
            They can be found in
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.ica_sk = skd.FastICA(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self)-> None:
        """
        Fit the sentence vectors and store the reduced vectors in a
        'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.ica_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()


class TSVD(DimReduction):
    """
    Truncated singular value decomposition (SVD) dimensionality reduction.
    """
    def __init__(
        self, dataset : pd.DataFrame, dim_out : int, **kwargs
    )-> None:
        """
        Initialise the TSVD dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Dataframe with one sentence per row; must contain a
            'sentence_embedding' column holding each sentence's vector.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.TruncatedSVD.
            They can be found in
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.tsvd_sk = skd.TruncatedSVD(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self)-> None:
        """
        Fit the sentence vectors and store the reduced vectors in a
        'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.tsvd_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()


class UMAP(DimReduction):
    """
    Class for UMAP dimensionality reduction.
    """
    def __init__(
        self, dataset : pd.DataFrame, dim_out : int, **kwargs
    )-> None:
        """
        Initialise the UMAP dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds
            to a sentence. It must contain one column named
            'sentence_embedding', with the vector representation
            of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Arguments passed to the umap.UMAP object (umap-learn
            package, not scikit-learn). They can be found in
            https://umap-learn.readthedocs.io/en/latest/parameters.html.
        """
        super().__init__(
            dataset = dataset, dim_out = dim_out
        )
        self.umap_sk = umap.UMAP(
            n_components = self.dim_out, **kwargs
        )

    def reduce_dimension(self)-> None:
        """
        Fit the vectorised sentences to obtain the reduced dimension
        sentence vectors, stored in a 'reduced_sentence_embedding'
        column of the dataset.
        """
        sentence_vectors_reduced = self.umap_sk.fit_transform(
            self.sentence_vectors
        )
        self.dataset[
            'reduced_sentence_embedding'] = sentence_vectors_reduced.tolist()


class TSNE(DimReduction):
    """
    Class for t-SNE dimensionality reduction.
    """
    def __init__(
        self, dataset : pd.DataFrame, dim_out : int, **kwargs
    )-> None:
        """
        Initialise the TSNE dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds
            to a sentence. It must contain one column named
            'sentence_embedding', with the vector representation
            of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
            NOTE(review): sklearn's default 'barnes_hut' method only
            supports dim_out < 4; pass method='exact' for larger values.
        **kwargs
            Arguments passed to the sklearn.manifold.TSNE object.
            They can be found in
            https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html.
        """
        super().__init__(
            dataset = dataset, dim_out = dim_out
        )
        self.tsne_sk = skm.TSNE(
            n_components = self.dim_out, **kwargs)

    def reduce_dimension(self)-> None:
        """
        Fit the vectorised sentences to obtain the reduced dimension
        sentence vectors, stored in a 'reduced_sentence_embedding'
        column of the dataset.
        """
        # TSNE.fit_transform requires an array-like; convert the list of
        # vectors explicitly.
        sentence_vectors_reduced = self.tsne_sk.fit_transform(
            np.array(self.sentence_vectors)
        )
        self.dataset[
            'reduced_sentence_embedding'] = sentence_vectors_reduced.tolist()

62 changes: 62 additions & 0 deletions neasqc_wp61/data/data_processing/reduce_emb_dim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/env python3

import sys
import argparse
import json
import pandas as pd
import numpy as np
from dim_reduction import (PCA, ICA, TSVD, UMAP, TSNE)

def main():
    """
    Read a JSON dataset of sentence embeddings, reduce the embedding
    dimension with the chosen algorithm and write the result as JSON.

    Returns
    -------
    int
        0 on success, 1 on error (propagated to the shell via sys.exit).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--data", help = "Json data file with embeddings")
    parser.add_argument("-o", "--dataout", help = "Json data file with reduced embeddings")
    parser.add_argument("-n", "--dimout", help = "Desired output dimension of the vectors")
    parser.add_argument("-a", "--algorithm", help = "Dimensionality reduction algorithm - 'PCA', 'ICA', 'TSVD', 'UMAP' or 'TSNE'")
    args = parser.parse_args()
    print(args)

    # Map the CLI name to its reducer class; doubles as the validity check.
    reducers = {'PCA': PCA, 'ICA': ICA, 'TSVD': TSVD, 'UMAP': UMAP, 'TSNE': TSNE}
    if args.algorithm not in reducers:
        print(f"{args.algorithm} not supported.")
        return 1

    try:
        df = pd.read_json(args.data)

        # Each 'sentence_vectorized' entry is a one-element list wrapping
        # the vector; unwrap it into a flat 'sentence_embedding' column.
        vectors = df["sentence_vectorized"].tolist()
        flat_list = [x[0] for x in vectors]
        df["sentence_embedding"] = flat_list
        # .copy() so the column assignment below writes to an independent
        # frame instead of a slice view (avoids SettingWithCopyWarning).
        reduceddf = df[['class', 'sentence', 'tree', 'sentence_embedding']].copy()
        reduceddf['class'] = reduceddf['class'].astype(str)

        reducer = reducers[args.algorithm](reduceddf, int(args.dimout))
        reducer.reduce_dimension()

        reducer.dataset = reducer.dataset.rename(columns={"reduced_sentence_embedding": "sentence_vectorized"})

        # Re-wrap each reduced vector in a one-element list so the output
        # matches the input JSON schema.
        vectors = reducer.dataset["sentence_vectorized"].tolist()
        reducer.dataset["sentence_vectorized"] = [[x] for x in vectors]

        dflistofdict = reducer.dataset[['class', 'sentence', 'tree', 'sentence_vectorized']].apply(lambda x: x.to_dict(), axis=1).to_list()

        with open(args.dataout, 'w') as fout:
            json.dump(dflistofdict, fout, indent=2)

        print("Done!")
        return 0

    except Exception as err:
        # Script-level boundary: report the failure and signal it through
        # the exit status instead of silently exiting 0.
        print(f"Unexpected {err=}")
        return 1

if __name__ == "__main__":
    sys.exit(main())
Loading

0 comments on commit b8cf54d

Please sign in to comment.