"""Gradient boosting classifier for the Kaggle Titanic dataset."""
import pandas as pd

import utils
# GridSearchCV now lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed.
from sklearn import ensemble, model_selection
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
print "\nCleaning up some data"
utils.clean_data(train)
utils.clean_data(test)
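# utils.clean_data is assumed (its source is not shown here) to impute missing
# values and encode the categorical columns numerically in place, along the
# lines of this hypothetical sketch:
#
#     def clean_data(df):
#         df["Age"] = df["Age"].fillna(df["Age"].median())     # assumption
#         df["Fare"] = df["Fare"].fillna(df["Fare"].median())  # assumption
#         df["Sex"] = df["Sex"].map({"male": 0, "female": 1})  # assumption
#         df["Embarked"] = df["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})  # assumption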
print "\nExtracting target and features"
print(train.shape)
target = train["Survived"].values
features = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
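# All feature columns must be numeric at this point; Sex and Embarked are
# assumed to have been encoded by clean_data above.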
print "\nUse gradient boosting classifier"
# Hyperparameter search over n_estimators (uncomment to run). The variable is
# renamed from grid_search so it no longer shadows a module name, and the
# removed iid argument and grid_scores_ attribute are replaced by cv_results_:
# search = model_selection.GridSearchCV(
#     estimator=ensemble.GradientBoostingClassifier(
#         learning_rate=0.001,
#         min_samples_split=40,
#         min_samples_leaf=1,
#         max_features=2,
#         max_depth=12,
#         n_estimators=70,
#         subsample=0.75,
#         random_state=10),
#     param_grid={'n_estimators': [140, 280, 560, 1120, 4480]},
#     scoring='roc_auc',
#     n_jobs=4,
#     cv=10)
# search.fit(features, target)
# print(search.cv_results_, search.best_params_, search.best_score_)
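# cv_results_ is a dict of arrays; a convenient way to inspect the search is
# pd.DataFrame(search.cv_results_)[["param_n_estimators", "mean_test_score"]].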
# Final model: a low learning rate offset by many boosting stages.
gbm = ensemble.GradientBoostingClassifier(
    learning_rate=0.005,   # small shrinkage applied to each stage
    min_samples_split=40,  # regularise by requiring large nodes to split
    min_samples_leaf=1,
    max_features=2,        # features considered per split
    max_depth=12,
    n_estimators=1500,     # many stages to compensate for the low learning rate
    subsample=0.75,        # stochastic gradient boosting on 75% of rows
    random_state=1)
gbm.fit(features, target)
print(gbm.feature_importances_)
print(gbm.score(features, target))
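# Note: score() on the training data is optimistic for a depth-12 ensemble;
# the cross-validation below (uncomment to run) gives a fairer estimate.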
# scores = model_selection.cross_val_score(gbm, features, target, scoring='accuracy', cv=20)
# print(scores)
# print(scores.mean())
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
prediction_gbm = gbm.predict(test_features)
utils.write_prediction(prediction_gbm, "results/gbm.csv")
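# utils.write_prediction is assumed to pair each prediction with the test
# set's PassengerId and write the usual two-column Kaggle submission CSV,
# e.g. (hypothetical sketch):
#
#     def write_prediction(prediction, path):
#         passenger_id = pd.read_csv("./data/test.csv")["PassengerId"]
#         pd.DataFrame({"PassengerId": passenger_id,
#                       "Survived": prediction}).to_csv(path, index=False)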