-
Notifications
You must be signed in to change notification settings - Fork 0
/
IMDB.R
129 lines (101 loc) · 5.05 KB
/
IMDB.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Necessary libraries
library(tidyverse)
library(caret)
library(xgboost)
# Load the cleaned IMDB data and turn movie_title, language and content_rating
# into factors. The conversion happens on the full data set, before the
# train/test split, so both subsets end up with the same factor levels.
# `across()` replaces the superseded `mutate_at(vars(...))` form.
imdb.clean <- read_csv('CleanedIMDBData.csv') %>%
  mutate(across(c(movie_title, language, content_rating), factor))
# IVTrans <- dummyVars(imdb_score ~ . -movie_title -Set, data = imdb.clean)
# imdb.iv <- predict(IVTrans, newdata = imdb.clean) %>% as.data.frame() %>% bind_cols(., imdb.clean %>% select(movie_title, Set, imdb_score))
# pcTrans <- preProcess(x = imdb.clean %>% select(-imdb_score), method = 'pca')
# imdb.pca <- predict(pcTrans, newdata = imdb.clean)
# plot_correlation(imdb.pca, type = 'continuous', cor_args = list(use = 'pairwise.'))
# Center and Scaling
# trans.cs <- preProcess(x = imdb %>% select(-imdb_score), method = c('center', 'scale'))
# imdb.cs <- predict(trans.cs, newdata = imdb)
# Use one or the other
# trans.01 <- preProcess(x = imdb.clean %>% select(-imdb_score), method = 'range', rangeBounds = c(0, 1))
# imdb.01 <- predict(trans.01, newdata = imdb.clean)
# Partition the rows by the `Set` flag.
# The training frame drops both the flag and the title. The test frame keeps
# `movie_title` because it is used as the submission Id further down.
imdb.train <- imdb.clean %>%
  filter(Set == 'train') %>%
  select(-Set, -movie_title)
imdb.test <- imdb.clean %>%
  filter(Set == 'test') %>%
  select(-Set)
###########
## caret ##
###########
# Boosted trees through caret (method 'xgbTree').
# The grid holds a single hyper-parameter combination; the values after `#`
# are alternatives left by the author (presumably earlier tuning results).
tree.grid <- expand.grid(
  nrounds = 500, # 115
  max_depth = 6, # 3
  eta = .025, # .195
  gamma = 0,
  colsample_bytree = .58, # .58
  min_child_weight = 1,
  subsample = .6666667 # .675
)

# Resampling scheme: 10-fold cross-validation.
tree.control <- trainControl(method = "cv", number = 10)

xgbTree.model <- train(
  imdb_score ~ .,
  data = imdb.train, # fit on the training set only
  method = 'xgbTree',
  preProcess = c('center', 'scale', 'zv'), # standardise, drop zero-variance predictors
  trControl = tree.control,
  tuneGrid = tree.grid,
  maximize = FALSE # lower metric is better (we minimise RMSE)
)
# Print the resampling summary.
xgbTree.model

# Submission file: one row per test movie, keyed by its title.
caret.submission <- data.frame(
  Id = imdb.test$movie_title,
  Predicted = predict(xgbTree.model, imdb.test)
)
write.csv(caret.submission, "caret-preds.csv", row.names = FALSE)
# Boosted linear model through caret (method 'xgbLinear').
# The grid crosses three lambda values with three alpha values (9 candidates)
# at a fixed nrounds and eta; each candidate is scored with 10-fold CV.
xgbLinear.model <- train(
  imdb_score ~ .,
  data = imdb.train, # fit on the training set only
  method = 'xgbLinear',
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = expand.grid(
    lambda = c(.1, .01, .001),
    alpha = c(.003, .05, .0001),
    nrounds = 500,
    eta = .025
  ),
  maximize = FALSE # lower metric is better (we minimise RMSE)
)
# Print the resampling summary.
xgbLinear.model

# Predict with the linear model fitted above. The previous version predicted
# with `xgbTree.model` here, so the linear model was never actually used.
# NOTE: this writes to the same "caret-preds.csv" as the xgbTree section and
# therefore replaces that file's contents with the linear-model predictions.
caret.submission <- data.frame(
  Id = imdb.test %>% pull(movie_title),
  Predicted = predict(xgbLinear.model, imdb.test)
)
write.csv(caret.submission, "caret-preds.csv", row.names = FALSE)
# The train and test data must have exactly the same columns, in the same order
#############
## xgboost ##
#############
# Keep the test titles aside (as a one-column tibble) for the submission file,
# and the training response as a plain vector.
test.id <- select(imdb.test, movie_title)
train.y <- imdb.train[["imdb_score"]]

# Strip the id and response columns so both frames hold predictors only.
# `any_of()` tolerates a column that is already absent, like `$<- NULL` does.
imdb.test <- imdb.test %>% select(-any_of(c("movie_title", "imdb_score")))
imdb.train <- imdb.train %>% select(-any_of("imdb_score"))

# Numeric matrix of the training predictors (data.matrix replaces factors by
# their integer codes).
trainM <- data.matrix(imdb.train, rownames.force = NA)

# DMatrix for xgboost, with the response attached as the label.
dtrain <- xgb.DMatrix(data = trainM, label = train.y, missing = NaN)

# Evaluation set reported while training: the training data itself.
watchlist <- list(trainM = dtrain)
# Booster settings shared by the cross-validation run and the final fit.
param <- list(
  objective = "reg:squarederror",
  booster = "gbtree",
  eval_metric = "rmse",
  eta = .025,
  max_depth = 6,
  subsample = .66667,
  colsample_bytree = .55
)

# 15-fold cross-validation with early stopping to choose the number of
# boosting rounds. `xgb.cv` scores each round on its own held-out folds and has
# no `watchlist` argument, so that argument is no longer passed here.
clf <- xgb.cv(
  params = param,
  data = dtrain,
  nrounds = 1000,
  nfold = 15,
  verbose = 1,
  print_every_n = 10,
  early_stopping_rounds = 20,
  maximize = FALSE
)

# `bestRound` was previously used without ever being defined. Take the best
# iteration recorded by early stopping; if that field is not present on the
# result, fall back to the round with the lowest mean test RMSE in the log.
bestRound <- clf[["best_iteration"]]
if (is.null(bestRound)) {
  bestRound <- which.min(clf$evaluation_log$test_rmse_mean)
}

# Final model: refit on the full training DMatrix for the selected number of
# rounds, reporting RMSE on the watchlist (the training data) as it goes.
xgb.model <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = bestRound,
  watchlist = watchlist,
  verbose = 1,
  maximize = FALSE
)
# Score the test set with the native xgboost model and write the submission.
# The test predictors go through the same data.matrix conversion as the
# training predictors.
testM <- data.matrix(imdb.test, rownames.force = NA)
preds <- predict(xgb.model, testM)

# `test.id` is a one-column tibble; pull that column out directly so the
# submission's first column is named 'Id' without a separate rename step.
xgboost.submission <- data.frame(Id = test.id[[1]], Predicted = preds)
write.csv(xgboost.submission, "xgboost-preds.csv", row.names = FALSE)
# Generate a tidymodels code template for an xgboost fit of imdb_score.
# By this point `imdb.train` no longer contains `imdb_score` (it was removed
# above for the native xgboost run), so the formula could not be resolved
# against it. Rebuild the training frame from `imdb.clean` instead.
# NOTE(review): prefix 'impairment' looks carried over from another project;
# it only affects the object names in the generated template. Confirm.
usemodels::use_xgboost(
  imdb_score ~ .,
  data = imdb.clean %>%
    filter(Set == 'train') %>%
    select(-c(Set, movie_title)),
  prefix = 'impairment'
)