-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathols.py
100 lines (77 loc) · 3.68 KB
/
ols.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
"""
Data preparation:
- Drop the redundant half of paired punctuation features (e.g. opening vs. closing parentheses, left vs. right brackets, etc.).
- Create author and cluster dummy variables.
- Save author names and book titles (so we can describe our predictions to a user), then drop them from the feature matrix since they are string data.
- Scale every feature to the [0, 1] range by subtracting the column minimum and dividing by (max - min).
"""
"""
feature count:
original: 39
with author dummies: 9813
with cluster dummies: 9863 => 10018 => 10498
39 general punctuation / metadata features, >9,800 author binary variables, >650 clustering features
"""
"""
data = pd.read_csv("/Users/jamesledoux/Desktop/all-data.csv")
data = data.drop('>', 1)
data = data.drop('(', 1)
data = data.drop('[', 1)
data = data.drop('{', 1)
data = pd.concat([data, pd.get_dummies(data['author'])], axis=1) #now you have a binary variable for each author
data = pd.concat([data, pd.get_dummies(data['smallClusterId'], prefix="general")], axis=1) #and for each cluster
data = pd.concat([data, pd.get_dummies(data['mediumClusterId'], prefix="medium")], axis=1)
data = pd.concat([data, pd.get_dummies(data['largeClusterId'], prefix="specific")], axis=1)
data = data.drop('smallClusterId', 1)
data = data.drop('mediumClusterId', 1)
data = data.drop('largeClusterId', 1)
"""
# Load the pre-built feature table (the output of the commented-out
# preprocessing above).
data = pd.read_csv('/Users/jamesledoux/Desktop/ols_data.csv')

# Keep the descriptive columns aside so predictions can be labelled later.
authors = data['author']
titles = data['book_name']
IDs = data['ID']

# Remove the identifier and string columns before scaling.
# `columns=` is used because the positional axis argument (`drop(label, 1)`)
# is rejected by pandas >= 2.0, where `axis` is keyword-only.
data = data.drop(columns=['ID', 'filename', 'author', 'book_name'])

# Replace NAs with 0 so the scaler and the regression see no missing values.
data = data.fillna(value=0)

# Scale every remaining column to [0, 1].
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

# Re-attach the unscaled IDs: they were removed above only so that they would
# not be scaled together with the feature columns.
data['ID'] = IDs
"""
Some test rating sets we came up with (our own personal ratings):
#aniket
id_list = [120, 43, 174, 30975, 1998, 12454, 514, 1250, 32325, 34117, 28464, 23042, 27761, 2916,4195, 2600, 11224, 12138, 12162, 12290, 12486]
user_ratings = [10, 10, 10, 2, 8, 2, 7, 9, 10, 10, 1, 8, 8, 9, 1, 3, 3, 1, 1, 1, 1]
#james
id_list = [996, 27761, 22791, 228, 150, 25344, 28464, 19211, 4217, 8800, 1400, 2009, 32325, 91, 102, 219, 11224, 2701, 32573, 10807, 26955, 23043]
user_ratings = [10, 10, 10, 8, 2, 5, 1, 2, 10, 7, 7, 3, 9,10, 9, 8, 5, 4,1, 1, 1, 9]
"""
# Books rated by the user (by ID) and the matching ratings, position by position.
id_list = [996, 27761, 22791, 228, 150, 25344, 28464, 19211, 4217, 8800, 1400, 2009, 32325, 91, 102, 219, 11224, 2701, 32573, 10807, 26955, 23043]
user_ratings = [10, 10, 10, 8, 2, 5, 1, 2, 10, 7, 7, 3, 9, 10, 9, 8, 5, 4, 1, 1, 1, 9]

# Training rows are the rated books. `.copy()` makes this an independent frame
# so adding the 'y' column below does not write into a slice of `data`
# (avoids pandas' chained-assignment / SettingWithCopy problem).
train_x = data[data['ID'].isin(id_list)].copy()

# A default starting value; it is overwritten with the actual ratings below.
train_x['y'] = 5

# isin() returns rows in the DataFrame's order, not in id_list order, so the
# ratings cannot be assigned positionally. Match each rating to its row by ID.
for book_id, rating in zip(id_list, user_ratings):
    train_x.loc[train_x['ID'] == book_id, 'y'] = rating

# Split the target off from the features.
train_y = train_x['y']
train_x = train_x.drop(columns=['y'])

# Keep the training IDs, then drop the column: the ID should not be
# interpreted as a feature.
trainIDs = train_x['ID']
train_x = train_x.drop(columns=['ID'])
# Fit an ordinary least squares model on the user's rated books.
model = LinearRegression()
model = model.fit(train_x, train_y)

# Drop the ID so the prediction matrix has the same columns as train_x.
# The full ID list is still available as `IDs` if needed later.
data = data.drop(columns=['ID'])

# Predict a rating for every book in the data set.
predicted_ratings = model.predict(data)

# Attach the predictions and the human-readable labels saved earlier.
data['preds'] = predicted_ratings
data['book_name'] = titles
data['author'] = authors

# Rank all books by predicted rating, highest first.
# `DataFrame.sort` was removed from pandas; `sort_values` is its replacement.
best_recommendations = data[['book_name', 'author', 'preds']]
best_recommendations = best_recommendations.sort_values(['preds'], ascending=False)
best_recommendations.head(25)
# TODO: find a way to exclude the books in the training data; seeing books you
# already rated is not useful.