-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpipeline.py
77 lines (57 loc) · 2.41 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MultiLabelBinarizer
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
"""
Preprocessing Pipeline with Custom Transformers
- there are changes
- we want to play around with out model
- if input data has changed
- we want to try different things
"""
# This class adds new features to our pipeline (just need to toggle the boolean if want to exclude)
class FeatureAdder(BaseEstimator, TransformerMixin):
def __init__(self, add_features = True):
self.add_features = add_features
def fit(self, X, y=None): return self
def transform(self, X, y=None):
if self.add_features:
rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
population_per_household = X[:, population_ix] / X[:, household_ix]
bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
# This class allows us to select entire or partial dataframes
# (i.e. we can control which attributes we want in our pipeline)
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, feature_names):
self.feature_names = feature_names
def fit(self, X, y=None): return self
def transform(self, X): return X[self.feature_names].values
# This class simply encodes labels to values between 0 and n-1
class MyLabelEncoder(TransformerMixin):
def __init__(self, *args, **kwargs):
self.encoder = LabelEncoder(*args, **kwargs)
def fit(self, x, y=0):
self.encoder.fit(x)
return self
def transform(self, x, y=0): return self.encoder.transform(x)
"""
This class converts categorical data to nominal data by some means
- text to integers
- integers to one hot vectors (encoding style: array of all 0's and one 1)
"""
class MyLabelBinarizer(TransformerMixin):
def __init__(self, *args, **kwargs):
self.encoder = LabelBinarizer(*args, **kwargs)
def fit(self, x, y=0):
self.encoder.fit(x)
return self
def transform(self, x, y=0): return self.encoder.transform(x)
# This class does the same thing but works for Mulilabel target data
class MyMultiLabelBinarizer(TransformerMixin):
def __init__(self, *args, **kwargs):
self.encoder = MultiLabelBinarizer(*args, **kwargs)
def fit(self, x, y=0):
self.encoder.fit(x)
return self
def transform(self, x, y=0): return self.encoder.transform(x)