-
Notifications
You must be signed in to change notification settings - Fork 0
/
log reg.py
140 lines (115 loc) · 4.93 KB
/
log reg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
__author__ = 'FireJohnny'
#################################################
# logRegression: Logistic Regression
# Author : zouxy
# Date : 2014-03-02
# HomePage : http://blog.csdn.net/zouxy09
# Email : [email protected]
#################################################
from numpy import *
import matplotlib.pyplot as plt
import time
# calculate the sigmoid function
def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)).

    Works element-wise on anything ``exp`` accepts (scalars, arrays,
    matrices); the result has the same shape as ``inX``.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
# train a logistic regression model using a selectable optimization algorithm
# input: train_x is a numpy matrix; each row is one sample
# train_y is a numpy matrix too; each row is the corresponding label
# opts is a dict of optimization options: the step size ('alpha'), the maximum
# number of iterations ('maxIter') and the algorithm name ('optimizeType')
def trainLogRegres(train_x, train_y, opts):
    """Fit logistic-regression weights with the optimizer named in ``opts``.

    Args:
        train_x: numpy matrix with one sample per row.
        train_y: numpy matrix with one label per row (column 0 is read).
        opts: dict with the keys
            'alpha'        -- step size,
            'maxIter'      -- number of passes over the training data,
            'optimizeType' -- 'gradDescent', 'stocGradDescent' or
                              'smoothStocGradDescent'.

    Returns:
        The weights as a (numFeatures, 1) column, initialised to all ones
        and updated ``maxIter`` times.

    Raises:
        NameError: if ``opts['optimizeType']`` is not one of the supported
            names. This is checked before training starts, so it is raised
            even when ``maxIter`` is 0.
    """
    # measure training time
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    optimizeType = opts['optimizeType']
    if optimizeType not in ('gradDescent', 'stocGradDescent',
                            'smoothStocGradDescent'):
        raise NameError('Not support optimize method type!')

    weights = ones((numFeatures, 1))

    for k in range(maxIter):
        if optimizeType == 'gradDescent':
            # batch update: every sample contributes to a single step
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif optimizeType == 'stocGradDescent':
            # one update per sample, visited in file order
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        else:  # 'smoothStocGradDescent'
            # Visit every sample exactly once per pass, in random order, to
            # reduce cyclic fluctuations. A real list is required because a
            # Python 3 range object does not support item removal.
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                # The configured step size is replaced by a decaying one that
                # never drops below 0.01.
                alpha = 4.0 / (1.0 + k + i) + 0.01
                pick = int(random.uniform(0, len(dataIndex)))
                # Map the position in the shrinking pool back to the actual
                # row number and remove it so it is not drawn again this pass.
                sampleIndex = dataIndex.pop(pick)
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = (weights
                           + alpha * train_x[sampleIndex, :].transpose() * error)

    print('Congratulations, training complete! Took %fs!'
          % (time.time() - startTime))
    return weights
# test your trained logistic regression model on a given test set
def testLogRegres(weights, test_x, test_y):
    """Return the fraction of test samples that the model classifies correctly.

    Args:
        weights: (numFeatures, 1) weight column, e.g. from trainLogRegres.
        test_x: numpy matrix with one sample per row.
        test_y: numpy matrix with one label per row (column 0 is read); any
            non-zero label counts as the positive class.

    Returns:
        Accuracy as a float in [0, 1]. An empty test set yields 0.0 instead
        of raising ZeroDivisionError.
    """
    numSamples = shape(test_x)[0]
    if numSamples == 0:
        return 0.0

    matchCount = 0
    for i in range(numSamples):
        # a sample is predicted positive when its probability exceeds 0.5
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    return float(matchCount) / numSamples
# show your trained logistic regression model (only available with 2-D data)
def showLogRegres(weights, train_x, train_y):
    """Plot the samples and the decision boundary of a 2-D model.

    Args:
        weights: three weights (w0, w1, w2) as a matrix or array of any shape
            with exactly three elements.
        train_x: numpy matrix with three columns per row; columns 1 and 2 are
            plotted as X1 and X2 (column 0 is not plotted).
        train_y: numpy matrix with one label per row (column 0 is read).
            Label 0 is drawn as red circles, label 1 as blue circles, other
            labels are not drawn.

    Returns:
        1 if ``train_x`` does not have exactly three columns (nothing is
        drawn), otherwise None after the plot window has been shown.
    """
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    # Work on plain arrays so boolean masks and .min()/.max() behave the same
    # for matrix and ndarray inputs.
    features = asarray(train_x)
    labels = asarray(train_y)[:, 0].astype(int)
    x1 = features[:, 1]
    x2 = features[:, 2]

    # draw all samples, one plot call per class instead of one per sample
    for label, marker in ((0, 'or'), (1, 'ob')):
        selected = labels == label
        plt.plot(x1[selected], x2[selected], marker)

    # draw the classification line w0 + w1 * x1 + w2 * x2 = 0
    w0, w1, w2 = [float(w) for w in asarray(weights).ravel()]
    if w2 != 0:
        min_x = float(x1.min())
        max_x = float(x1.max())
        y_min_x = (-w0 - w1 * min_x) / w2
        y_max_x = (-w0 - w1 * max_x) / w2
        plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    elif w1 != 0:
        # w2 == 0 makes the boundary vertical; solve for x1 instead of
        # dividing by zero.
        boundary_x = -w0 / w1
        plt.plot([boundary_x, boundary_x],
                 [float(x2.min()), float(x2.max())], '-g')

    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
#################################################
# logRegression: Logistic Regression
# Author : zouxy
# Date : 2014-03-02
# HomePage : http://blog.csdn.net/zouxy09
# Email : [email protected]
#################################################
from numpy import *
import matplotlib.pyplot as plt
import time
def loadData(fileName='testSet.txt'):
    """Read a whitespace-separated data file into feature and label matrices.

    Each non-blank line must hold at least three fields: two feature values
    followed by the label. A constant 1.0 is prepended to every sample.

    Args:
        fileName: path of the data file; defaults to 'testSet.txt' in the
            current working directory.

    Returns:
        A tuple ``(train_x, train_y)`` of numpy matrices: ``train_x`` has one
        row ``[1.0, x1, x2]`` per sample and ``train_y`` is the matching
        column of labels.
    """
    train_x = []
    train_y = []
    # the with-block guarantees the file is closed even if parsing fails
    with open(fileName) as fileIn:
        for line in fileIn:
            lineArr = line.split()
            if not lineArr:
                continue  # tolerate blank lines, e.g. a trailing newline
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    # asmatrix is the long-standing name behind the `mat` alias, which is no
    # longer available in NumPy 2.0
    return asmatrix(train_x), asmatrix(train_y).transpose()
# Driver: load the data set, train, evaluate and plot.
# The print calls use the single-argument form, which behaves the same under
# Python 2 and Python 3.

## step 1: load data
print("step 1: load data...")
train_x, train_y = loadData()
# NOTE: the model is evaluated on the same samples it was trained on
test_x = train_x
test_y = train_y

## step 2: training...
print("step 2: training...")
opts = {'alpha': 0.01, 'maxIter': 100, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)

## step 3: testing
print("step 3: testing...")
accuracy = testLogRegres(optimalWeights, test_x, test_y)

## step 4: show the result
print("step 4: show the result...")
print('The classify accuracy is: %.3f%%' % (accuracy * 100))
showLogRegres(optimalWeights, train_x, train_y)