-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
78 lines (49 loc) · 1.96 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import logging
# Root logger setup: INFO level, messages prefixed with a wall-clock timestamp.
format = "%(asctime)s: %(message)s"
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format=format,
)
class OneHotTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode a single categorical column of a DataFrame.

    The column ``feature_name`` is replaced by one indicator column per
    label, named ``"<feature_name>.<label>"``. The
    ``"<feature_name>.Unknown"`` column is then dropped, so ``Unknown``
    acts as the reference category.

    Parameters
    ----------
    feature_name : str
        Name of the column in ``X`` to encode.
    feature_labels : sequence
        Labels used to fit the binarizer. Must contain ``"Unknown"``,
        otherwise ``transform`` raises ``KeyError`` when dropping the
        reference column.
    """

    def __init__(self, feature_name, feature_labels):
        self.feature_name = feature_name
        self.feature_labels = feature_labels
        self.lb = LabelBinarizer()

    def fit(self, X, y=None):
        """Fit the binarizer on ``feature_labels``; ``X`` and ``y`` are unused."""
        self.lb.fit(self.feature_labels)
        return self

    def transform(self, X, y=None):
        """Return a float DataFrame with ``feature_name`` one-hot encoded.

        ``X`` itself is not modified; every step below builds a new frame.
        """
        encoded = self.lb.transform(X[self.feature_name])
        # Name the columns from the fitted classes_, not from feature_labels:
        # LabelBinarizer emits one column per sorted, de-duplicated label, so
        # the caller-supplied order does not necessarily match the output.
        # NOTE(review): with exactly two labels LabelBinarizer emits a single
        # column, which cannot match two names -- confirm callers always
        # pass three or more distinct labels.
        column_names = [
            f'{self.feature_name}.{label}' for label in self.lb.classes_
        ]
        one_hot = pd.DataFrame(data=encoded,
                               columns=column_names,
                               index=X.index)
        # Drop the reference category to avoid a redundant dummy column.
        one_hot = one_hot.drop(f'{self.feature_name}.Unknown', axis=1)
        remaining = X.drop(self.feature_name, axis=1)
        combined = pd.concat([remaining, one_hot], join='inner', axis=1)
        return combined.astype(float)
class myPCA(BaseEstimator, TransformerMixin):
    """PCA step that can be switched off by setting ``n_components`` to 0.

    Parameters
    ----------
    n_components
        Forwarded to ``sklearn.decomposition.PCA``. The value ``0`` disables
        the step: ``fit`` does nothing and ``transform`` returns ``X`` as is.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.pca = PCA(n_components=n_components)

    def fit(self, X, y=None):
        """Fit the inner PCA on ``X`` and log its explained variance ratio."""
        if self.n_components != 0:
            # set_params() (used by GridSearchCV / Pipeline.set_params) only
            # assigns self.n_components and never re-runs __init__, so the
            # inner PCA must be re-synchronised before fitting.
            self.pca.set_params(n_components=self.n_components)
            self.pca.fit(X)
            logging.info(f'\nExplained variance ratio: {self.pca.explained_variance_ratio_}')
        return self

    def transform(self, X, y=None):
        """Project ``X`` onto the fitted components, or return it untouched."""
        if self.n_components == 0:
            return X
        # The PCA is handed a copy of X rather than X itself.
        return self.pca.transform(X.copy())