'''
Implements various dimensionality reduction techniques
for later classification and for data exploration.
'''
import numpy as np
import pandas as pd
from sklearn.decomposition import KernelPCA, PCA
from sklearn import random_projection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
import matplotlib.pyplot as plt


def dim_exploration(X_train):
    '''
    Fits PCA to the original feature data and plots the cumulative sum of the
    explained variance ratio. This gives an idea of how many components are
    needed for other (non-linear) dimensionality reduction methods such as KPCA.

    Parameters
    ==========
    X_train: pandas DataFrame. Original feature data; does not have to be split
        for supervised learning at this stage. Data must be encoded and
        normalized before doing dimensionality reduction.

    Returns
    ==========
    None. Shows a plot of the cumulative explained variance.
    '''
    pca = PCA().fit(X_train)
    plt.figure()
    cum_sum_pca = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(cum_sum_pca)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Explained Variance')
    plt.show()
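
# Usage sketch (hypothetical name, assuming X_scaled is an encoded and normalized
# DataFrame): dim_exploration(X_scaled). Choose num_comps roughly where the
# cumulative curve flattens out, e.g. around 0.95 cumulative explained variance.
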
def dim_red_pca_only(X_train, num_comps, verbose=True):
    '''
    Reduces the dimensionality of the original dataset to a predefined number
    of components using PCA only. The efficacy of the reduction can be assessed
    via the classification performance of a downstream learner.

    Parameters
    ==========
    X_train: pandas DataFrame. Original feature data; does not have to be split
        for supervised learning at this stage. Data must be encoded and
        normalized before doing dimensionality reduction.
    num_comps: int. Number of dimensions to reduce the original features to.
    verbose: bool. If True, writes 'pca_feats_rank_name.csv' mapping each
        principal component to the original feature with the largest absolute
        loading, together with that component's explained variance (%). This
        correspondence with the original features is not available for the
        non-linear methods, but PCA gives a good idea.

    Returns
    ==========
    X_pca: reduced matrix of size (N, num_comps).
    '''
    # pca
    pca = PCA(n_components=num_comps)
    X_pca = pca.fit_transform(X_train)
    if verbose:
        pc_importance = pca.explained_variance_ratio_
        # for each component, index of the original feature with the largest
        # absolute loading
        feats_rank = np.argmax(np.abs(pca.components_), axis=1)
        feats_rank_name = pd.DataFrame(X_train.columns[feats_rank].tolist())
        feats_rank_name = pd.concat([feats_rank_name, pd.DataFrame(pc_importance) * 100], axis=1)
        feats_rank_name.columns = ['top original feature', 'explained variance (%)']
        feats_rank_name.to_csv('pca_feats_rank_name.csv')
    return X_pca
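
# Usage sketch (hypothetical names): the reduced array can be fed straight to a
# classifier to judge the quality of the reduction, e.g.
#   X_pca = dim_red_pca_only(X_scaled, num_comps=10)
#   LogisticRegression().fit(X_pca, np.ravel(y_labels))
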
def dim_red_comparison(X_train, y_data, num_comps, verbose=True):
    '''
    Reduces the dimensionality of the original dataset to a predefined number
    of components using several methods: PCA, kernel PCA, random projections,
    and LDA. The efficacy of each reduction can be assessed via the
    classification performance of a downstream learner.

    Parameters
    ==========
    X_train: pandas DataFrame. Original feature data; does not have to be split
        for supervised learning at this stage. Data must be encoded and
        normalized before doing dimensionality reduction.
    y_data: pandas DataFrame or Series. Original label data; must be encoded.
        Only used for LDA.
    num_comps: int. Number of dimensions to reduce the original features to.
        Note that LDA can produce at most (number of classes - 1) components.
    verbose: bool. If True, writes 'pca_feats_rank_name.csv' mapping each
        principal component to the original feature with the largest absolute
        loading, together with that component's explained variance (%). This
        correspondence with the original features is not available for the
        non-linear methods, but PCA gives a good idea.

    Returns
    ==========
    X_pca, X_kpca, X_rp, X_lda: reduced matrices of size (N, num_comps) for
        each of the reduction methods.
    '''
    # pca
    pca = PCA(n_components=num_comps)
    X_pca = pca.fit_transform(X_train)
    # kernelized pca
    k_pca = KernelPCA(n_components=num_comps, kernel="rbf", fit_inverse_transform=True, gamma=10)
    X_kpca = k_pca.fit_transform(X_train)
    # transform back if needed:
    # X_train_kpca_bck = k_pca.inverse_transform(X_kpca)
    # random projections
    rand_p = random_projection.GaussianRandomProjection(n_components=num_comps)
    X_rp = rand_p.fit_transform(X_train)
    # LDA (a supervised method for dimensionality reduction); requires
    # num_comps <= min(n_features, n_classes - 1)
    lda = LinearDiscriminantAnalysis(n_components=num_comps)
    X_lda = lda.fit(X_train, np.ravel(y_data)).transform(X_train)
    # only PCA can give us the importance in the original feature space because
    # each component is a linear combination of the original features
    if verbose:
        pc_importance = pca.explained_variance_ratio_
        # for each component, index of the original feature with the largest
        # absolute loading
        feats_rank = np.argmax(np.abs(pca.components_), axis=1)
        feats_rank_name = pd.DataFrame(X_train.columns[feats_rank].tolist())
        feats_rank_name = pd.concat([feats_rank_name, pd.DataFrame(pc_importance) * 100], axis=1)
        feats_rank_name.columns = ['top original feature', 'explained variance (%)']
        feats_rank_name.to_csv('pca_feats_rank_name.csv')
    return X_pca, X_kpca, X_rp, X_lda
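

if __name__ == '__main__':
    # Minimal smoke-test sketch on synthetic data (hypothetical shapes and
    # parameters): encode/scale the features, inspect the explained variance,
    # then compare the reduction methods.
    from sklearn.datasets import make_classification

    X_raw, y_raw = make_classification(n_samples=500, n_features=20,
                                       n_informative=10, n_classes=3,
                                       random_state=0)
    feature_names = ['feat_{}'.format(i) for i in range(X_raw.shape[1])]
    X_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_raw),
                            columns=feature_names)
    y_data = pd.Series(y_raw, name='label')

    dim_exploration(X_scaled)  # opens the cumulative explained variance plot
    X_pca_only = dim_red_pca_only(X_scaled, num_comps=2, verbose=False)
    # num_comps=2 satisfies the LDA constraint (n_classes - 1 = 2)
    X_pca, X_kpca, X_rp, X_lda = dim_red_comparison(X_scaled, y_data,
                                                    num_comps=2, verbose=False)
    print(X_pca_only.shape, X_kpca.shape, X_rp.shape, X_lda.shape)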