-
Notifications
You must be signed in to change notification settings - Fork 0
/
Assignment 3 - CS 310.py
254 lines (205 loc) · 7.93 KB
/
Assignment 3 - CS 310.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Stefana Ciustea
# CS 310 - Assignment 3
# read the csv file, split the row, split the column and
# retrieve the "Adj Close" column values and store them into a dictionary
import csv
def get_adj_column(filename):
file = open(filename, 'r')
# creating dictreader object:
csv_file = csv.DictReader(file)
# creating empty list:
adj_close_strings = []
for col in csv_file:
# iterating over each col and append
# values to empty list
adj_close_strings.append(col['Adj Close'])
adj_close = []
for value in adj_close_strings:
# turning strings value into floats
adj_close.append(float(value))
# returning list
return adj_close
allprices = {
"IBM":get_adj_column('IBM.csv'),
"MSFT":get_adj_column('MSFT.csv'),
"GOOG":get_adj_column('GOOG.csv'),
"AAPL": get_adj_column('AAPL.csv'),
"AMZN":get_adj_column('AMZN.csv'),
"FB":get_adj_column('FB.csv')
}
# print(allprices)
# Compute the correlation between any two stocks:
import math
import statistics
def get_correlation_numerator(list_x, list_y):
# calculates the top half of the sample correlation coefficient equation
mean_x = statistics.mean(list_x)
mean_y = statistics.mean(list_y)
if len(list_x) != len(list_y):
raise ValueError("lists must be the same length")
sum_of_product = 0
for i in range(len(list_x)):
sum_of_product += (list_x[i]-mean_x)*(list_y[i]-mean_y)
return sum_of_product
def sum_of_correlation_squares(list_n):
# calculates part of the bottom half of the sample correlation coefficient equation for x and y
sum_of_corr_sq = 0
mean_n = statistics.mean(list_n)
for i in range(len(list_n)):
sum_of_corr_sq += (list_n[i]-mean_n)**2
return sum_of_corr_sq
def get_correlation(list_x, list_y):
# calculates the sample correlation coefficient
correlation_numerator = get_correlation_numerator(list_x, list_y)
correlation_value = (correlation_numerator)/(math.sqrt(sum_of_correlation_squares(list_x)*sum_of_correlation_squares(list_y)))
return correlation_value
# all stock pairs and their correlations from max to min:
print("My own implementation:")
import time
start_time = time.time()
print("MSFT:AMZN = " + str(get_correlation(allprices['MSFT'],allprices['AMZN'])))
print("AMZN:AAPL= " + str(get_correlation(allprices['AMZN'],allprices['AAPL'])))
print("MSFT:AAPL = " + str(get_correlation(allprices['MSFT'],allprices['AAPL'])))
print("GOOG:AMZN = " + str(get_correlation(allprices['GOOG'],allprices['AMZN'])))
print("IBM:FB = " + str(get_correlation(allprices['IBM'],allprices['FB'])))
print("GOOG:AAPL = " + str(get_correlation(allprices['GOOG'],allprices['AAPL'])))
print("GOOG:FB = " + str(get_correlation(allprices['GOOG'],allprices['FB'])))
print("MSFT:GOOG = " + str(get_correlation(allprices['MSFT'],allprices['GOOG'])))
print("IBM:GOOG = " + str(get_correlation(allprices['IBM'],allprices['GOOG'])))
print("FB:AMZN = " + str(get_correlation(allprices['FB'],allprices['AMZN'])))
print("IBM:AAPL = " + str(get_correlation(allprices['IBM'],allprices['AAPL'])))
print("FB:AAPL = " + str(get_correlation(allprices['FB'],allprices['AAPL'])))
print("IBM:AMZN = " + str(get_correlation(allprices['IBM'],allprices['AMZN'])))
print("MSFT:FB = " + str(get_correlation(allprices['MSFT'],allprices['FB'])))
print("IBM:MSFT = " + str(get_correlation(allprices['IBM'],allprices['MSFT'])))
part1 = "--- %s seconds ---" % (time.time() - start_time)
print(part1)
print("")
# MSFT:AMZN = 0.8741709140306773
# AMZN:AAPL= 0.8405047735028573
# MSFT:AAPL = 0.7910724697645043
# GOOG:AMZN = 0.6593918651991056
# IBM:FB = 0.6235315363005262
# GOOG:AAPL = 0.5831372907191038
# GOOG:FB = 0.5255643021735804
# MSFT:GOOG = 0.46331640746650576
# IBM:GOOG = 0.4224397164650116
# FB:AMZN = 0.04804592966462066
# IBM:AAPL = -0.024718866883184906
# FB:AAPL = -0.038716802228980905
# IBM:AMZN = -0.10170142015566165
# MSFT:FB = -0.29524958156416975
# IBM:MSFT = -0.4212553003530259
# --- 0.09494233131408691 seconds ---
# Part 2 - Write an one line numpy vectorized version of function correlation(x, y)
# to compute the correlation of two lists x, and y.
import numpy as np
print("Numpy implementation:")
import time
start_time = time.time()
print("MSFT:AMZN = ")
print(str(np.corrcoef(allprices['MSFT'], allprices['AMZN'])))
print("")
print("AMZN:AAPL= ")
print(str(np.corrcoef(allprices['AMZN'], allprices['AAPL'])))
print("")
print("MSFT:AAPL = ")
print(str(np.corrcoef(allprices['MSFT'], allprices['AAPL'])))
print("")
print("GOOG:AMZN = ")
print(str(np.corrcoef(allprices['GOOG'], allprices['AMZN'])))
print("")
print("IBM:FB = ")
print(str(np.corrcoef(allprices['IBM'], allprices['FB'])))
print("")
print("GOOG:AAPL = ")
print(str(np.corrcoef(allprices['GOOG'], allprices['AAPL'])))
print("")
print("GOOG:FB = ")
print(str(np.corrcoef(allprices['GOOG'], allprices['FB'])))
print("")
print("MSFT:GOOG = ")
print(str(np.corrcoef(allprices['MSFT'], allprices['GOOG'])))
print("")
print("IBM:GOOG = ")
print(str(np.corrcoef(allprices['IBM'], allprices['GOOG'])))
print("")
print("FB:AMZN = ")
print(str(np.corrcoef(allprices['FB'], allprices['AMZN'])))
print("")
print("IBM:AAPL = ")
print(str(np.corrcoef(allprices['IBM'], allprices['AAPL'])))
print("")
print("FB:AAPL = ")
print(str(np.corrcoef(allprices['FB'], allprices['AAPL'])))
print("")
print("IBM:AMZN = ")
print(str(np.corrcoef(allprices['IBM'], allprices['AMZN'])))
print("")
print("MSFT:FB = ")
print(str(np.corrcoef(allprices['MSFT'], allprices['FB'])))
print("")
print("IBM:MSFT = ")
print(str(np.corrcoef(allprices['IBM'], allprices['MSFT'])))
print("")
part2 = "--- %s seconds ---" % (time.time() - start_time)
print(part2)
print("")
# MSFT:AMZN =
# [[ 1. 0.87417091]
# [ 0.87417091 1. ]]
# AMZN:AAPL=
# [[ 1. 0.84050477]
# [ 0.84050477 1. ]]
# MSFT:AAPL =
# [[ 1. 0.79107247]
# [ 0.79107247 1. ]]
# GOOG:AMZN =
# [[ 1. 0.65939187]
# [ 0.65939187 1. ]]
# IBM:FB =
# [[ 1. 0.62353154]
# [ 0.62353154 1. ]]
# GOOG:AAPL =
# [[ 1. 0.58313729]
# [ 0.58313729 1. ]]
# GOOG:FB =
# [[ 1. 0.5255643]
# [ 0.5255643 1. ]]
# MSFT:GOOG =
# [[ 1. 0.46331641]
# [ 0.46331641 1. ]]
# IBM:GOOG =
# [[ 1. 0.42243972]
# [ 0.42243972 1. ]]
# FB:AMZN =
# [[ 1. 0.04804593]
# [ 0.04804593 1. ]]
# IBM:AAPL =
# [[ 1. -0.02471887]
# [-0.02471887 1. ]]
# FB:AAPL =
# [[ 1. -0.0387168]
# [-0.0387168 1. ]]
# IBM:AMZN =
# [[ 1. -0.10170142]
# [-0.10170142 1. ]]
# MSFT:FB =
# [[ 1. -0.29524958]
# [-0.29524958 1. ]]
# IBM:MSFT =
# [[ 1. -0.4212553]
# [-0.4212553 1. ]]
# --- 0.0552830696105957 seconds ---
# Part 3: Compare the run time of both implementations.
print("Run time of Part 1 implementation: " + str(part1))
print("Run time of Part 2 implementation: " + str(part2))
print("")
print("Run time of Part 1 implementation is longer than the run time of Part 2 implementation.")
print("This shows us that using numpy is better suited for calculating the correlation coefficient.")
print("This is likely because Numpy functions are implemented in C, and also may have better implementation. https://towardsdatascience.com/how-fast-numpy-really-is-e9111df44347#:~:text=Because%20the%20Numpy%20array%20is,leap%20in%20terms%20of%20speed.")
# Run time of Part 1 implementation: --- 0.06476306915283203 seconds ---
# Run time of Part 2 implementation: --- 0.0552830696105957 seconds ---
# Run time of Part 1 implementation is longer than the run time of Part 2 implementation.
# This shows us that using numpy is better suited for calculating the correlation coefficient.
# This is likely because Numpy functions are implemented in C, and also may have better implementation. https://towardsdatascience.com/how-fast-numpy-really-is-e9111df44347#:~:text=Because%20the%20Numpy%20array%20is,leap%20in%20terms%20of%20speed.