-
Notifications
You must be signed in to change notification settings - Fork 0
/
IMDBCleaning.R
242 lines (194 loc) · 9.63 KB
/
IMDBCleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
##
## This is all my code to analyze the IMDB database
##
## Libraries I Need
library(tidyverse)
library(DataExplorer)
## Read in the data
imdb.train <- read_csv("./IMDBTrain.csv")
imdb.test <- read_csv("./IMDBTest.csv")
## Merge the two datasets together so when I clean the
## training dataset I also treat the test set the same way
names(imdb.test)[names(imdb.test) == "Id"] <- "movie_title"
imdb <- bind_rows(train = imdb.train, test = imdb.test, .id = "Set")
####################################
## Some Exploratory Data Analysis ##
####################################
## Overall summary of the data
summary(imdb)
## Which variables have missing values?
plot_missing(imdb)
## Which variables are related to each other?
plot_correlation(imdb, type = "continuous",
cor_args = list(use = "pairwise.complete.obs"))
###########################################
## Go through and clean up the variables ##
###########################################
## Duration - only one missing so just look it up
## and fill it in
# imdb[is.na(imdb$duration),]$duration <- 116 #or
imdb <- imdb %>%
mutate(duration = replace(duration, is.na(duration), 115))
## Color - mode imputation and convert to 0/1
#imdb$color[is.na(imdb$color)] <- "Color"
imdb <- imdb %>%
mutate(color = replace(color, is.na(color), "Color"))
imdb <- imdb %>%
mutate(color = ifelse(color == "Color", 1, 0))
## num_user_for_reviews = mean imputation
imdb <- imdb %>%
mutate(num_user_for_reviews = replace(num_user_for_reviews, is.na(num_user_for_reviews),
mean(num_user_for_reviews, na.rm = TRUE)))
# Column was decided due to high correlation between num_user_for_reviews and num_voted_users, num_voted_users also had a higher correlation with imdb_score
# Median imputation can lower correlation. Consider regression imputation
## Director - convert to number of movies made by director
director_movie_count <- imdb %>%
group_by(director_name) %>%
summarise(movies_made = n())
# Joining by director name
imdb <- imdb %>%
left_join(director_movie_count) %>%
select(-director_name)
# Using categorical variables can lead to the Curse of Dimensionality, best to reduce the number of explanatory variables.
## Stochastic Regression imputation for num_critic_for_reviews
imdb <- imdb %>%
mutate(num_critic_for_reviews = replace(num_critic_for_reviews, is.na(num_critic_for_reviews),
median(num_critic_for_reviews, na.rm = TRUE)))
# Median imputation again might decrease correlation
## Language - only five missing values so we replace them
missing_languages <- c("Silent", "Silent", "Silent", "English", "Silent")
#imdb$language[is.na(imdb$language)] <- missing_languages ## or
imdb <- imdb %>%
mutate(language = replace(language, is.na(language), missing_languages))
## Lots of very small categories for language so combine into English vs. Non-English
imdb <- imdb %>%
mutate(language = fct_collapse(language, Other = unique(language[language != "English"])))
table(imdb$language)
imdb <- imdb %>%
mutate(english = ifelse(language == "English", 1, 0)) %>%
select(-language)
## Content-rating - collapse GP --> PG and create "other"
## X --> NC-17, TV-?? --> TV, M--> PG13
imdb <- imdb %>%
mutate(content_rating = fct_explicit_na(content_rating, na_level = "Not Rated")) %>%
mutate(content_rating = fct_collapse(content_rating, PG = c("M", "GP", "PG"),
NC17 = c("X", "NC-17"),
TV = c("TV-14", "TV-G", "TV-PG")))
table(imdb$content_rating)
## Genres - create dummy variables for each genre and number of genres assigned
imdb <- imdb %>% mutate(num_genre = (str_split(genres, "\\|") %>%
sapply(., FUN = length)))
dummy_genres <- imdb$genres %>% sapply(function(x){unlist(str_split(x, '\\|'))}) %>% qdapTools::mtabulate()
colnames(dummy_genres) <- paste0('Genre_', colnames(dummy_genres))
rownames(dummy_genres) <- c()
imdb <- cbind(imdb, dummy_genres)
# Now some observations only have 1 instance, this means NO variability => overfitting
# Film-Noir, News, and Short all have < 10 movies so combining these into an other category
imdb <- imdb %>% mutate(Genre_Other = `Genre_Film-Noir` + Genre_News + Genre_Short) %>%
select(-`Genre_Film-Noir`, -Genre_News, -Genre_Short)
## actor_name columns - we created num_top_actors which tell us
## how many “top” actors were in a movie. “Top” actors were actors who
## were in multiple movies. All 3 actor column were used to decide who was a top actor.
all.actors <- imdb %>% select(actor_1_name, actor_2_name, actor_3_name) %>% do.call(c, args=.)
actors.freq <- data.frame(actor = all.actors) %>% filter(!is.na(actor)) %>%
group_by(actor) %>% summarize(n = n()) %>%
arrange(desc(n))
names(actors.freq)[1] <- "actor_1_name"
names(actors.freq)[2] <- "actor_1_n_movies"
imdb <- imdb %>%
left_join(actors.freq)
names(actors.freq)[1] <- "actor_2_name"
names(actors.freq)[2] <- "actor_2_n_movies"
imdb <- imdb %>%
left_join(actors.freq)
names(actors.freq)[1] <- "actor_3_name"
names(actors.freq)[2] <- "actor_3_n_movies"
imdb <- imdb %>%
left_join(actors.freq)
imdb <- imdb %>%
mutate(actor_1_n_movies = (ifelse(is.na(actor_1_n_movies), 0, actor_1_n_movies)),
actor_2_n_movies = (ifelse(is.na(actor_2_n_movies), 0, actor_2_n_movies)),
actor_3_n_movies = (ifelse(is.na(actor_3_n_movies), 0, actor_3_n_movies)))
# top.actors <- actors.freq %>% filter(n > 10) %>% pull(actor)
# imdb <- imdb %>%
# mutate(num_top_actors = (ifelse(actor_1_name %in% top.actors, 1, 0) +
# ifelse(actor_2_name %in% top.actors, 1, 0) +
# ifelse(actor_3_name %in% top.actors, 1, 0)))
## facebook_like columns - we made a column num_pop_actors for the total number of popular actors
## in a movie based off of their Facebook likes. With this, we decided to throw
## out cast_facebook likes as it seemed repetitive. It’s pretty mute since there’s
## also actor Facebook likes
actor.likes <- imdb %>% select(actor_1_facebook_likes, actor_2_facebook_likes, actor_3_facebook_likes) %>%
do.call(c, args = .)
actors.likes <- data.frame(actor = all.actors, likes = actor.likes) %>%
filter(!is.na(actor)) %>% group_by(actor) %>% summarize(likes = max(likes)) %>%
arrange(desc(likes))
pop.actors <- actors.likes %>% filter(likes > quantile(likes, probs = 0.95)) %>%
pull(actor)
imdb <- imdb %>%
mutate(num_pop_actors = (ifelse(actor_1_name %in% pop.actors, 1, 0) +
ifelse(actor_2_name %in% pop.actors, 1, 0) +
ifelse(actor_3_name %in% pop.actors, 1, 0)))
## movie_facebook_likes - we believe it to be unreliable.
## It seems that some of the movies with 0s in the data have more
## likes than top movies in real life. Although, there are no NAs
## so not much cleaning to do there. We were considering throwing it out
# There is a somewhat strong correlation between budget and gross, so we will
# impute the budget first since budget has fewer missing values and then use the
# imputed the budget to predict the gross
## LM for Budget using only full data
# sqrt(budget) to remove negative predictions
budget.lm <- lm(sqrt(budget) ~ num_critic_for_reviews +
duration +
num_voted_users +
title_year +
num_critic_for_reviews +
movie_facebook_likes +
num_genre +
english +
aspect_ratio +
actor_1_n_movies +
actor_2_n_movies +
actor_3_n_movies +
movies_made +
num_pop_actors,
data = imdb)
budget.preds <- (predict(budget.lm, newdata = (imdb %>% filter(is.na(budget)))) +
rnorm(sum(is.na(imdb$budget)), 0, sigma(budget.lm)))^2
imdb <- imdb %>%
mutate(budget = replace(budget, is.na(budget), budget.preds))
# Regression imputation leads to the risk of inflating correlation
# Fill in budget first because there are less NAs in budget than in gross
## Stochastic reg imputation for gross
# LM plus add in some noise rnorm(sum(is.na(imdb$gross))), no bias in correlation
gross.lm <- lm(sqrt(gross) ~ num_critic_for_reviews +
duration +
num_voted_users +
title_year +
num_critic_for_reviews +
movie_facebook_likes +
num_genre +
english +
aspect_ratio +
actor_1_n_movies +
actor_2_n_movies +
actor_3_n_movies +
movies_made +
num_pop_actors +
budget,
data = imdb)
gross.preds <- (predict(gross.lm, newdata = (imdb %>% filter(is.na(gross)))) +
rnorm(sum(is.na(imdb$gross)), 0, sigma(gross.lm)))^2
imdb <- imdb %>%
mutate(gross = replace(gross, is.na(gross), gross.preds))
rm(list = c("gross.lm", "budget.lm")) # To free up RAM space
## I am going to throw out any variable we didn't use/clean
imdb <- imdb %>% select(-cast_total_facebook_likes, -movie_imdb_link, -facenumber_in_poster,
-plot_keywords, -country, -movie_facebook_likes, -director_facebook_likes,
-actor_1_name, -actor_2_name, -actor_3_name, -actor_1_facebook_likes,
-actor_2_facebook_likes, -actor_3_facebook_likes, -genres, -num_user_for_reviews)
plot_missing(imdb)
####################################
## Write out cleaned up IMDB Data ##
####################################
write_csv(x = imdb, path = "./CleanedIMDBData.csv")