# Spot-checking is a way of discovering which algorithms
# perform well on your machine learning problem.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# ----------------------- Linear Algorithms -------------------------
# Logistic regression assumes a Gaussian distribution
# for the numeric input variables and can model binary classification problems.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
model = LogisticRegression(max_iter=1000)  # more iterations so the default solver converges on this data
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Logistic Regression: %.3f" % results.mean())
# Linear Discriminant Analysis or LDA is a statistical technique for binary and
# multi-class classification. It too assumes a Gaussian distribution for the
# numerical input variables.
model = LinearDiscriminantAnalysis()
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("LDA: %.3f" % results.mean())
# ----------------------- Non-Linear Algorithms -------------------------
# K-Nearest Neighbors (or KNN) uses a distance metric to find the K most
# similar instances in the training data for a new instance and takes the
# most common class among those neighbors (a majority vote) as the prediction.
model = KNeighborsClassifier()
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("KNN: %.3f" % results.mean())
# Naive Bayes calculates the probability of each class and the conditional
# probability of each input value given each class. For new data these
# probabilities are multiplied together, assuming the input values are
# independent of one another (a simple, or naive, assumption).
model = GaussianNB()
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Naive Bayes: %.3f" % results.mean())
# Classification and Regression Trees (CART or just decision trees) construct
# a binary tree from the training data. Split points are chosen greedily by
# evaluating each attribute and each value of each attribute in the training
# data in order to minimize a cost function (like Gini).
model = DecisionTreeClassifier()
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("CART: %.3f" % results.mean())
# Support Vector Machines (or SVM) seek a hyperplane (a line, in two
# dimensions) that best separates the two classes. The training instances
# closest to that separating hyperplane are called support vectors and
# determine where it is placed. SVM has been extended to support multiple
# classes and, via kernels, non-linear class boundaries.
model = SVC()
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("SVM: %.3f" % results.mean())
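# ----------------------- Comparing the Algorithms -------------------------
# Since the point of spot-checking is to compare algorithms side by side, the
# sketch below (an addition, not part of the original recipe) runs all six
# models through the same 10-fold cross-validation and prints the mean and
# standard deviation of accuracy for each, so the strongest candidates are
# easy to pick out. It reuses X, Y and seed defined above.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC()),
]
for name, candidate in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(candidate, X, Y, cv=kfold, scoring='accuracy')
    print("%s: %.3f (%.3f)" % (name, cv_results.mean(), cv_results.std()))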