forked from NEASQC/WP6_QNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
12,448 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
Test accuracy for run 0: 0.6586466165413534
Test accuracy for run 1: 0.6596491228070176
Test accuracy for run 2: 0.6446115288220552
Test accuracy for run 3: 0.6827067669172933
Test accuracy for run 4: 0.6741854636591479
Test accuracy for run 5: 0.6676691729323309
Test accuracy for run 6: 0.6581453634085213
Test accuracy for run 7: 0.6481203007518797
Test accuracy for run 8: 0.6756892230576441
Test accuracy for run 9: 0.656140350877193
Test accuracy for run 10: 0.6516290726817042
Test accuracy for run 11: 0.6606516290726817
Test accuracy for run 12: 0.6636591478696742
Test accuracy for run 13: 0.6691729323308271
Test accuracy for run 14: 0.6596491228070176
Test accuracy for run 15: 0.6661654135338346
Test accuracy for run 16: 0.6451127819548872
Test accuracy for run 17: 0.6476190476190476
Test accuracy for run 18: 0.6290726817042607
Test accuracy for run 19: 0.6716791979949874
Test accuracy for run 20: 0.6466165413533834
Test accuracy for run 21: 0.6606516290726817
Test accuracy for run 22: 0.6601503759398496
Test accuracy for run 23: 0.6781954887218045
Test accuracy for run 24: 0.66265664160401
Test accuracy for run 25: 0.6571428571428571
Test accuracy for run 26: 0.631077694235589
Test accuracy for run 27: 0.6736842105263158
Test accuracy for run 28: 0.6385964912280702
Test accuracy for run 29: 0.6636591478696742
Average test accuracy: 0.6587468671679199
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,278 @@ | ||
""" | ||
DimReduction | ||
============ | ||
Module containing the base class for performing dimensionality reduction. | ||
""" | ||
from abc import ABC, abstractmethod | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import sklearn.decomposition as skd | ||
import sklearn.manifold as skm | ||
import umap | ||
|
||
|
||
class DimReduction(ABC):
    """
    Base class for dimensionality reduction of
    vectors representing sentences.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int
    ) -> None:
        """
        Initialise the dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds
            to a sentence. It must contain one column named
            'sentence_embedding', with the vector representation
            of each sentence.
        dim_out : int
            Desired output dimension of the vectors.

        Raises
        ------
        ValueError
            If the dataset has no 'sentence_embedding' column.
        """
        self.dataset = dataset
        try:
            self.sentence_vectors = dataset['sentence_embedding'].to_list()
        except KeyError as err:
            # Chain the original KeyError so the traceback shows the cause.
            raise ValueError(
                'Sentence vector not present in the dataset.') from err
        self.dim_out = dim_out

    @abstractmethod
    def reduce_dimension(self) -> None:
        """
        Fit the dataset to output vectors with the desired dimension.
        """

    def save_dataset(
        self, filename: str,
        dataset_path: str) -> None:
        """
        Save the reduced dataset as a TSV file in a given path.

        Parameters
        ----------
        filename : str
            Name of the file to save to (without extension).
        dataset_path : str
            Path where to store the dataset.
        """
        # Bug fix: the filename argument was previously ignored.
        filepath = f"{dataset_path}{filename}.tsv"
        self.dataset.to_csv(
            filepath, sep='\t', index=False
        )
|
||
|
||
class PCA(DimReduction):
    """
    Principal component analysis dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the PCA dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.PCA.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.pca_sk = skd.PCA(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.pca_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class ICA(DimReduction):
    """
    Independent component analysis dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the ICA dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.FastICA.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.ica_sk = skd.FastICA(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.ica_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class TSVD(DimReduction):
    """
    Truncated singular value decomposition dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the TSVD dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.decomposition.TruncatedSVD.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.tsvd_sk = skd.TruncatedSVD(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.tsvd_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class UMAP(DimReduction):
    """
    UMAP dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the UMAP dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to umap.UMAP.
            See
            https://umap-learn.readthedocs.io/en/latest/parameters.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.umap_sk = umap.UMAP(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        reduced = self.umap_sk.fit_transform(self.sentence_vectors)
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
||
|
||
class TSNE(DimReduction):
    """
    t-SNE dimensionality reduction.
    """
    def __init__(
        self, dataset: pd.DataFrame, dim_out: int, **kwargs
    ) -> None:
        """
        Initialise the TSNE dimensionality reduction class.

        Parameters
        ----------
        dataset : pd.DataFrame
            Pandas dataframe where each row corresponds to a
            sentence, with a 'sentence_embedding' column holding
            the vector representation of each sentence.
        dim_out : int
            Desired output dimension of the vectors.
        **kwargs
            Extra arguments forwarded to sklearn.manifold.TSNE.
            See
            https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html.
        """
        super().__init__(dataset=dataset, dim_out=dim_out)
        self.tsne_sk = skm.TSNE(n_components=self.dim_out, **kwargs)

    def reduce_dimension(self) -> None:
        """
        Fit the vectorised sentences and store the reduced vectors
        in a 'reduced_sentence_embedding' column of the dataset.
        """
        # TSNE.fit_transform requires an array, not a list of lists.
        reduced = self.tsne_sk.fit_transform(np.array(self.sentence_vectors))
        self.dataset['reduced_sentence_embedding'] = reduced.tolist()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/env python3 | ||
|
||
import sys | ||
import argparse | ||
import json | ||
import pandas as pd | ||
import numpy as np | ||
from dim_reduction import (PCA, ICA, TSVD, UMAP, TSNE) | ||
|
||
def main():
    """
    CLI entry point: read a JSON dataset with sentence embeddings,
    reduce their dimension with the chosen algorithm, and write the
    result back as JSON.

    Returns
    -------
    int
        0 on success, 1 on unsupported algorithm or any runtime error
        (so the shell sees a non-zero exit code on failure).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--data", help = "Json data file with embeddings")
    parser.add_argument("-o", "--dataout", help = "Json data file with reduced embeddings")
    parser.add_argument("-n", "--dimout", help = "Desired output dimension of the vectors")
    parser.add_argument("-a", "--algorithm", help = "Dimensionality reduction algorithm - 'PCA', 'ICA', 'TSVD', 'UMAP' or 'TSNE'")
    args = parser.parse_args()
    print(args)

    # Dispatch table instead of an if/elif chain; also serves as the
    # single source of truth for the supported algorithm names.
    reducers = {
        'PCA': PCA,
        'ICA': ICA,
        'TSVD': TSVD,
        'UMAP': UMAP,
        'TSNE': TSNE,
    }
    if args.algorithm not in reducers:
        print(f"{args.algorithm} not supported.")
        return 1

    try:
        df = pd.read_json(args.data)

        # Each 'sentence_vectorized' entry is a one-element list; unwrap it.
        vectors = df["sentence_vectorized"].tolist()
        df["sentence_embedding"] = [x[0] for x in vectors]
        # .copy() avoids pandas chained-assignment on a view of df.
        reduceddf = df[['class', 'sentence', 'tree', 'sentence_embedding']].copy()
        reduceddf['class'] = reduceddf['class'].astype(str)

        reducer = reducers[args.algorithm](reduceddf, int(args.dimout))
        reducer.reduce_dimension()

        reducer.dataset = reducer.dataset.rename(
            columns={"reduced_sentence_embedding": "sentence_vectorized"})

        # Re-wrap each reduced vector in a one-element list to match the
        # input file's schema.
        vectors = reducer.dataset["sentence_vectorized"].tolist()
        reducer.dataset["sentence_vectorized"] = [[x] for x in vectors]

        dflistofdict = reducer.dataset[
            ['class', 'sentence', 'tree', 'sentence_vectorized']
        ].apply(lambda x: x.to_dict(), axis=1).to_list()

        with open(args.dataout, 'w') as fout:
            json.dump(dflistofdict, fout, indent=2)

        print("Done!")

    # Top-level boundary: report the failure and exit non-zero instead of
    # silently exiting 0 as before.
    except Exception as err:
        print(f"Unexpected {err=}")
        return 1

    return 0

if __name__ == "__main__":
    sys.exit(int(main() or 0))
Oops, something went wrong.