# install packages
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(reshape2)
library(liver)
##
## Attaching package: 'liver'
## The following object is masked from 'package:base':
##
## transform
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(leaps)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#read data
data <- data(house)
# Check our NA
length(which(is.na(house), arr.ind=TRUE))
## [1] 0
summary(house)
## house.age distance.to.MRT stores.number latitude
## Min. : 0.000 Min. : 23.38 Min. : 0.000 Min. :24.93
## 1st Qu.: 9.025 1st Qu.: 289.32 1st Qu.: 1.000 1st Qu.:24.96
## Median :16.100 Median : 492.23 Median : 4.000 Median :24.97
## Mean :17.713 Mean :1083.89 Mean : 4.094 Mean :24.97
## 3rd Qu.:28.150 3rd Qu.:1454.28 3rd Qu.: 6.000 3rd Qu.:24.98
## Max. :43.800 Max. :6488.02 Max. :10.000 Max. :25.01
## longitude unit.price
## Min. :121.5 Min. : 7.60
## 1st Qu.:121.5 1st Qu.: 27.70
## Median :121.5 Median : 38.45
## Mean :121.5 Mean : 37.98
## 3rd Qu.:121.5 3rd Qu.: 46.60
## Max. :121.6 Max. :117.50
# Example data frame structure
library(leaflet)
# Create a leaflet map
map <- leaflet(data = house) %>%
addTiles() # You can choose different tilesets with addProviderTiles() if desired
# Add markers to the map based on latitude and longitude
map <- map %>% addMarkers(
lat = ~latitude,
lng = ~longitude,
label = ~unit.price,
popup = ~paste("Median House Price: $", unit.price),
clusterOptions = markerClusterOptions()
)
# Display the map
map
# Boxplots for each varaible
boxplot(house$house.age)
boxplot(house$distance.to.MRT)
boxplot(house$stores.number)
boxplot(house$latitude)
boxplot(house$longitude)
boxplot(house$unit.price, main="Box Plot of Unit Price")
library(ggplot2)
# Create a list of plots
plots <- list()
# Loop through each numeric variable and create a density plot with a line
for (i in 1:ncol(house)) {
p <- ggplot(house, aes(x = house[, i])) +
geom_density(color = "blue") +
labs(title = colnames(house)[i], x = "Value", y = "Density")
plots[[i]] <- p
}
# Print the plots one by one
for (i in 1:ncol(house)) {
print(plots[[i]])
}
#Correlation matrix
correlation_matrix <- cor(house)
corrplot(correlation_matrix)
hist(house$unit.price, main="Distribution of Unit Price", xlab="Unit Price")
library(car)
## Loading required package: carData
symbox(house$unit.price, ylab = "unit price", main = "Boxplots for Each Transformations of Unit Price")
transprice <- (house$unit.price)^0.5
hist(transprice, ylab = "unit price", main="Distribution of Unit Price in Square root Transformation", xlab="Unit Price")
names(house)
## [1] "house.age" "distance.to.MRT" "stores.number" "latitude"
## [5] "longitude" "unit.price"
pairs(house)
ggpairs( house )
\[ Y_{unit.price} = -580.5 -0.02*X_{house.age} -0.0004*X_{distance.to.MRT} + 0.09*X_{stores.number} + 21.83*X_{latitude}+0.365*X_{longitude}\]
model1 <- lm(unit.price^0.5 ~., data=house)
summary(model1)
##
## Call:
## lm(formula = unit.price^0.5 ~ ., data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6426 -0.3926 -0.0687 0.3552 4.4799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.805e+02 4.706e+02 -1.233 0.218
## house.age -2.132e-02 2.955e-03 -7.216 2.63e-12 ***
## distance.to.MRT -3.782e-04 5.481e-05 -6.900 1.99e-11 ***
## stores.number 9.132e-02 1.441e-02 6.336 6.25e-10 ***
## latitude 2.183e+01 3.406e+00 6.409 4.05e-10 ***
## longitude 3.446e-01 3.724e+00 0.093 0.926
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6793 on 408 degrees of freedom
## Multiple R-squared: 0.6381, Adjusted R-squared: 0.6337
## F-statistic: 143.9 on 5 and 408 DF, p-value: < 2.2e-16
plot(model1)
s = summary(model1)
data.frame(s$coefficients)
## Estimate Std..Error t.value Pr...t..
## (Intercept) -5.804553e+02 4.706491e+02 -1.23330794 2.181710e-01
## house.age -2.132262e-02 2.954890e-03 -7.21604710 2.629399e-12
## distance.to.MRT -3.782137e-04 5.481049e-05 -6.90038863 1.989437e-11
## stores.number 9.132277e-02 1.441392e-02 6.33573470 6.252533e-10
## latitude 2.182880e+01 3.405926e+00 6.40906402 4.047266e-10
## longitude 3.446381e-01 3.724248e+00 0.09253897 9.263153e-01
s$r.squared
## [1] 0.6381435
#Outliers location
outliers <- rstandard(model1)[rstandard(model1) < -2 | rstandard(model1) > 2] #leverage plot
matrix <- as.matrix(outliers)
rownames <- rownames(matrix)
levoutlier<-as.numeric(rownames)
length(levoutlier)
## [1] 20
outliersmore <- which(model1$fitted.values < 4.5) #residual plot
mat <- as.matrix(outliersmore)
row <- rownames(mat)
resoutlier<-as.numeric(row)
length(resoutlier)
## [1] 36
outliers <- union(levoutlier, resoutlier)
length(outliers)
## [1] 53
#New data set without the outliers
data_no_outlier <- house[-outliers,]
model2 <- lm(unit.price^0.5 ~., data=data_no_outlier)
summary(model2)
##
## Call:
## lm(formula = unit.price^0.5 ~ ., data = data_no_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.25287 -0.28715 0.00386 0.28657 1.66668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.159e+03 3.240e+02 -3.576 0.000397 ***
## house.age -2.896e-02 2.101e-03 -13.781 < 2e-16 ***
## distance.to.MRT -5.908e-04 4.583e-05 -12.891 < 2e-16 ***
## stores.number 6.768e-02 1.035e-02 6.539 2.16e-10 ***
## latitude 2.966e+01 2.496e+00 11.884 < 2e-16 ***
## longitude 3.496e+00 2.547e+00 1.373 0.170742
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.455 on 355 degrees of freedom
## Multiple R-squared: 0.7455, Adjusted R-squared: 0.7419
## F-statistic: 208 on 5 and 355 DF, p-value: < 2.2e-16
plot(model2)
# Perform the Durbin-Watson test
library(car)
d = durbinWatsonTest(model2)
d
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01887703 2.024025 0.826
## Alternative hypothesis: rho != 0
s2 <- summary(model2)
data.frame(s2$coefficients)
## Estimate Std..Error t.value Pr...t..
## (Intercept) -1.158639e+03 3.239736e+02 -3.576339 3.967260e-04
## house.age -2.895541e-02 2.101116e-03 -13.780967 6.611972e-35
## distance.to.MRT -5.908531e-04 4.583443e-05 -12.891031 1.873129e-31
## stores.number 6.768070e-02 1.034982e-02 6.539313 2.155294e-10
## latitude 2.966045e+01 2.495818e+00 11.884060 1.196885e-27
## longitude 3.495557e+00 2.546659e+00 1.372605 1.707416e-01
\[ Y^{0.5}_{unit.price} = -1158.639 -0.029*X_{house.age} -0.0006*X_{distance.to.MRT} + 0.067*X_{stores.number} + 29.66*X_{latitude}+3.49*X_{longitude}\]
residuals <- model2$residuals
ggplot(data = data.frame(residuals = residuals), aes(x = fitted(model2), y = residuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = "Residuals vs. Fitted Values", x = "Fitted Values", y = "Residuals")
## `geom_smooth()` using formula = 'y ~ x'
# QQ-plot
qqnorm(residuals)
qqline(residuals)
# Shapiro-Wilk test
shapiro.test(residuals)
##
## Shapiro-Wilk normality test
##
## data: residuals
## W = 0.99652, p-value = 0.6245
The p-value is greater than significance level, so fail to reject the null hypothesis, meaning that there is strong evidence to suggest normality.
vif(model2)
## house.age distance.to.MRT stores.number latitude longitude
## 1.012439 1.963795 1.479819 1.131224 1.443359
vif(model1)
## house.age distance.to.MRT stores.number latitude longitude
## 1.014249 4.282985 1.613339 1.599017 2.923881
p=5
models =regsubsets(unit.price^0.5~., data =data_no_outlier, nvmax =p)
summary(models)
## Subset selection object
## Call: regsubsets.formula(unit.price^0.5 ~ ., data = data_no_outlier,
## nvmax = p)
## 5 Variables (and intercept)
## Forced in Forced out
## house.age FALSE FALSE
## distance.to.MRT FALSE FALSE
## stores.number FALSE FALSE
## latitude FALSE FALSE
## longitude FALSE FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
## house.age distance.to.MRT stores.number latitude longitude
## 1 ( 1 ) " " "*" " " " " " "
## 2 ( 1 ) "*" "*" " " " " " "
## 3 ( 1 ) "*" "*" " " "*" " "
## 4 ( 1 ) "*" "*" "*" "*" " "
## 5 ( 1 ) "*" "*" "*" "*" "*"
modelss <- summary(models)
data.frame(modelss$outmat)
## house.age distance.to.MRT stores.number latitude longitude
## 1 ( 1 ) *
## 2 ( 1 ) * *
## 3 ( 1 ) * * *
## 4 ( 1 ) * * * *
## 5 ( 1 ) * * * * *
m1 <- lm(unit.price^0.5~distance.to.MRT, data =data_no_outlier)
m2 <- lm(unit.price^0.5~house.age+distance.to.MRT, data =data_no_outlier)
m3 <- lm(unit.price^0.5~house.age+distance.to.MRT+latitude, data =data_no_outlier)
m4 <- lm(unit.price^0.5~house.age+distance.to.MRT+stores.number+latitude, data=data_no_outlier)
m5 <- lm(unit.price^0.5~house.age+distance.to.MRT+stores.number+latitude+longitude, data =data_no_outlier)
# Create a vector of model names
model_names <- c("m1", "m2", "m3", "m4", "m5")
# Create an empty data frame to store AIC values
aic_data <- data.frame(Model = character(length(model_names)), AIC = numeric(length(model_names)))
# Calculate and store AIC values for each model
for (i in 1:length(model_names)) {
model <- get(model_names[i]) # Get the model by name
aic_value <- AIC(model)
aic_data[i, ] <- c(model_names[i], aic_value)
}
# Display the table of AIC values
print(aic_data)
## Model AIC
## 1 m1 714.114574310383
## 2 m2 627.925936238496
## 3 m3 502.27481418991
## 4 m4 463.809226197423
## 5 m5 463.898403702297
result.sum = summary(models)
criteria <- data.frame(Nvar = 1:(p),
R2 = result.sum$rsq,
R2adj = result.sum$adjr2,
CP = result.sum$cp,
BIC = result.sum$bic)
criteria <- cbind(criteria, AIC = as.numeric(aic_data$AIC))
print(criteria)
## Nvar R2 R2adj CP BIC AIC
## 1 1 0.4796386 0.4781891 368.889675 -224.0389 714.1146
## 2 2 0.5924215 0.5901445 213.560759 -306.3387 627.9259
## 3 3 0.7138176 0.7114127 46.216537 -428.1009 502.2748
## 4 4 0.7441641 0.7412895 5.884046 -462.6776 463.8092
## 5 5 0.7455146 0.7419303 6.000000 -458.6996 463.8984
ggplot(data = criteria, aes(x = Nvar)) +
geom_line(aes(y = R2), color = "red") +
labs(title = "R2")