-
Notifications
You must be signed in to change notification settings - Fork 0
/
Predicting Housing Price HTML.Rmd
108 lines (96 loc) · 3.6 KB
/
Predicting Housing Price HTML.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
---
title: "Predicting Ames, Iowa Housing Price"
author: "Alex Navarro"
date: "10/27/2021"
output:
  html_document:
    df_print: paged
  pdf_document: default
---
```{r setup, include=FALSE}
# Echo the source of every chunk in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)

# Attach the packages for this analysis (MASS supplies boxcox()).
library(MASS)
library(ggplot2)
library(markdown)
library(knitr)
library(RColorBrewer)

# Import the raw data; the path is relative to the working directory.
data <- read.csv(file = "AmesHousing.csv")
```
# Exploratory Data Analysis
```{r, echo=TRUE}
# Fix the random seed so the random sampling done later is reproducible:
set.seed(1)

# Preview the first ten rows:
head(data, 10)

# Show the type of every column together with a few example values:
str(data)

# Per-column summaries, which also report NA counts where present:
summary(data)
```
```{r, echo=TRUE}
# Drop every row that contains at least one missing value:
data2 <- na.omit(data)

# Check the structure of the cleaned data frame:
str(data2)

# Split the rows into training and test indices.
# The row count is read from data2 instead of being hard-coded, so the
# indices always match the cleaned data. With 2258 cleaned rows, 1800
# training rows is roughly an 80/20 split.
n_obs <- nrow(data2)
n_train <- 1800
stopifnot(n_train < n_obs)
train <- sample(n_obs, n_train)
test <- seq_len(n_obs)[-train]

# Build a data frame holding only the columns treated as continuous.
# NOTE(review): columns are picked by position, which depends on the column
# order of the CSV - confirm the indices against names(data2).
num.ames <- data.frame(
  data2[, c(2, 3, 14:17, 23, 31, 33:35, 40:49, 51, 53, 56, 58, 59,
            63:68, 70, 71, 74)]
)

# Plot SalePrice against every other column, using the training rows only:
plot(SalePrice ~ ., data = num.ames, subset = train)
```
# Modeling:
```{r, echo=TRUE}
# Model 1: eleven predictors, fitted on the training rows.
fit <- lm(
  SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
    Garage.Yr.Blt + Wood.Deck.SF + Open.Porch.SF,
  data = num.ames,
  subset = train
)
summary(fit)

# Model 2: same as Model 1 but without Open.Porch.SF.
fit2 <- lm(
  SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
    Garage.Yr.Blt + Wood.Deck.SF,
  data = num.ames,
  subset = train
)
summary(fit2)

# Model 3: same as Model 2 but without TotRms.AbvGrd.
fit3 <- lm(
  SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + Garage.Yr.Blt +
    Wood.Deck.SF,
  data = num.ames,
  subset = train
)
summary(fit3)
```
# Check Model Diagnostics:
```{r, cache=TRUE}
# NOTE(review): this chunk is cached but reads fit3 from an earlier chunk;
# knitr only invalidates a cache on upstream changes when a dependency is
# declared - confirm caching is wanted here.

# Pull residuals and fitted values through the accessor functions rather
# than `$` partial matching (`fit3$res`, `fit3$fitted`), which silently
# depends on no other component sharing the same prefix.
model_resid <- residuals(fit3)
model_fitted <- fitted(fit3)

# Residuals against fitted values:
plot(
  model_resid ~ model_fitted,
  xlab = "Fitted values",
  ylab = "Residuals",
  main = "Fitted vs Residuals"
)

# Histogram of the residuals as a visual normality check:
hist(
  model_resid,
  main = "Normality Test",
  xlab = "Residuals",
  col = c("blue", "red", "green")
)

# Normal Q-Q plot of the residuals with its reference line:
qqnorm(model_resid)
qqline(model_resid)

# Shapiro-Wilk test of normality on the residuals:
shapiro.test(model_resid)
```
# Box-Cox Transformation:
```{r, cache=TRUE}
# Box-Cox profile for the response. Only the training rows are passed in,
# so the held-out rows do not influence the choice of transformation and
# this step is consistent with fit, fit2 and fit3 (all use subset = train).
boxcox(
  SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
    Garage.Yr.Blt + Wood.Deck.SF,
  data = num.ames[train, ]
)

# Log of SalePrice for every row of num.ames. This is kept as a full-length
# vector because later chunks use it together with subset = train.
SalePriceLog <- log(num.ames$SalePrice)

# Refit with the log response, restricted to the training rows:
fit4 <- lm(
  SalePriceLog ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
    Garage.Yr.Blt + Wood.Deck.SF,
  data = num.ames,
  subset = train
)

# Residuals against fitted values for the log model. The accessor functions
# replace `$` partial matching (`fit4$res`, `fit4$fitted`).
plot(
  residuals(fit4) ~ fitted(fit4),
  xlab = "Fitted values",
  ylab = "Residuals",
  main = "Diagnostic Check Model 4"
)
```
# Determine Categorical Variables For Model:
```{r,cache=TRUE}
# Compare two nested models with anova() to judge whether adding Street
# improves the fit.
# The response is written as log(SalePrice) so it is evaluated inside data2
# itself, instead of coming from a separate global vector that must stay
# row-aligned with data2.
model1 <- lm(
  log(SalePrice) ~ Overall.Qual + Year.Built + Year.Remod.Add +
    BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
    Wood.Deck.SF + Open.Porch.SF,
  data = data2,
  subset = train
)
model2 <- lm(
  log(SalePrice) ~ Overall.Qual + Year.Built + Year.Remod.Add +
    BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
    Wood.Deck.SF + Open.Porch.SF + Street,
  data = data2,
  subset = train
)
anova(model1, model2)

# Print the structure of data2 again (column names and types):
str(data2)
```