# install packages
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(reshape2)
library(liver)
## 
## Attaching package: 'liver'
## The following object is masked from 'package:base':
## 
##     transform
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(leaps)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
#read data
data <- data(house)
# Check our NA
length(which(is.na(house), arr.ind=TRUE))
## [1] 0

EDA

summary(house)
##    house.age      distance.to.MRT   stores.number       latitude    
##  Min.   : 0.000   Min.   :  23.38   Min.   : 0.000   Min.   :24.93  
##  1st Qu.: 9.025   1st Qu.: 289.32   1st Qu.: 1.000   1st Qu.:24.96  
##  Median :16.100   Median : 492.23   Median : 4.000   Median :24.97  
##  Mean   :17.713   Mean   :1083.89   Mean   : 4.094   Mean   :24.97  
##  3rd Qu.:28.150   3rd Qu.:1454.28   3rd Qu.: 6.000   3rd Qu.:24.98  
##  Max.   :43.800   Max.   :6488.02   Max.   :10.000   Max.   :25.01  
##    longitude       unit.price    
##  Min.   :121.5   Min.   :  7.60  
##  1st Qu.:121.5   1st Qu.: 27.70  
##  Median :121.5   Median : 38.45  
##  Mean   :121.5   Mean   : 37.98  
##  3rd Qu.:121.5   3rd Qu.: 46.60  
##  Max.   :121.6   Max.   :117.50
# Example data frame structure
library(leaflet)

# Create a leaflet map
map <- leaflet(data = house) %>%
  addTiles()  # You can choose different tilesets with addProviderTiles() if desired

# Add markers to the map based on latitude and longitude
map <- map %>% addMarkers(
  lat = ~latitude,
  lng = ~longitude,
  label = ~unit.price,
  popup = ~paste("Median House Price: $", unit.price),
  clusterOptions = markerClusterOptions()
)

# Display the map
map
# Boxplots for each varaible
boxplot(house$house.age)

boxplot(house$distance.to.MRT)

boxplot(house$stores.number)

boxplot(house$latitude)

boxplot(house$longitude)

boxplot(house$unit.price, main="Box Plot of Unit Price")

library(ggplot2)

# Create a list of plots
plots <- list()

# Loop through each numeric variable and create a density plot with a line
for (i in 1:ncol(house)) {
  p <- ggplot(house, aes(x = house[, i])) +
    geom_density(color = "blue") +
    labs(title = colnames(house)[i], x = "Value", y = "Density")
  plots[[i]] <- p
}

# Print the plots one by one
for (i in 1:ncol(house)) {
  print(plots[[i]])
}

#Correlation matrix
correlation_matrix <- cor(house)
corrplot(correlation_matrix)

hist(house$unit.price, main="Distribution of Unit Price", xlab="Unit Price")

library(car)
## Loading required package: carData
symbox(house$unit.price, ylab = "unit price", main = "Boxplots for Each Transformations of Unit Price")

transprice <- (house$unit.price)^0.5
hist(transprice, ylab = "unit price", main="Distribution of Unit Price in Square root Transformation", xlab="Unit Price")

names(house)
## [1] "house.age"       "distance.to.MRT" "stores.number"   "latitude"       
## [5] "longitude"       "unit.price"

EDA: Check Assumption

pairs(house)

ggpairs( house )

Model:

\[ Y_{unit.price} = -580.5 -0.02*X_{house.age} -0.0004*X_{distance.to.MRT} + 0.09*X_{stores.number} + 21.83*X_{latitude}+0.365*X_{longitude}\]

model1 <- lm(unit.price^0.5 ~., data=house)
summary(model1)
## 
## Call:
## lm(formula = unit.price^0.5 ~ ., data = house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6426 -0.3926 -0.0687  0.3552  4.4799 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -5.805e+02  4.706e+02  -1.233    0.218    
## house.age       -2.132e-02  2.955e-03  -7.216 2.63e-12 ***
## distance.to.MRT -3.782e-04  5.481e-05  -6.900 1.99e-11 ***
## stores.number    9.132e-02  1.441e-02   6.336 6.25e-10 ***
## latitude         2.183e+01  3.406e+00   6.409 4.05e-10 ***
## longitude        3.446e-01  3.724e+00   0.093    0.926    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6793 on 408 degrees of freedom
## Multiple R-squared:  0.6381, Adjusted R-squared:  0.6337 
## F-statistic: 143.9 on 5 and 408 DF,  p-value: < 2.2e-16
plot(model1)

s = summary(model1)
data.frame(s$coefficients)
##                      Estimate   Std..Error     t.value     Pr...t..
## (Intercept)     -5.804553e+02 4.706491e+02 -1.23330794 2.181710e-01
## house.age       -2.132262e-02 2.954890e-03 -7.21604710 2.629399e-12
## distance.to.MRT -3.782137e-04 5.481049e-05 -6.90038863 1.989437e-11
## stores.number    9.132277e-02 1.441392e-02  6.33573470 6.252533e-10
## latitude         2.182880e+01 3.405926e+00  6.40906402 4.047266e-10
## longitude        3.446381e-01 3.724248e+00  0.09253897 9.263153e-01
s$r.squared
## [1] 0.6381435

Remove Outliers

#Outliers location
outliers <- rstandard(model1)[rstandard(model1) < -2 | rstandard(model1) > 2] #leverage plot
matrix <- as.matrix(outliers)
rownames <- rownames(matrix)
levoutlier<-as.numeric(rownames)
length(levoutlier)
## [1] 20
outliersmore <- which(model1$fitted.values < 4.5) #residual plot
mat <- as.matrix(outliersmore)
row <- rownames(mat)
resoutlier<-as.numeric(row)
length(resoutlier)
## [1] 36
outliers <- union(levoutlier, resoutlier)
length(outliers)
## [1] 53
#New data set without the outliers
data_no_outlier <- house[-outliers,]
model2 <- lm(unit.price^0.5 ~., data=data_no_outlier)
summary(model2)
## 
## Call:
## lm(formula = unit.price^0.5 ~ ., data = data_no_outlier)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25287 -0.28715  0.00386  0.28657  1.66668 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.159e+03  3.240e+02  -3.576 0.000397 ***
## house.age       -2.896e-02  2.101e-03 -13.781  < 2e-16 ***
## distance.to.MRT -5.908e-04  4.583e-05 -12.891  < 2e-16 ***
## stores.number    6.768e-02  1.035e-02   6.539 2.16e-10 ***
## latitude         2.966e+01  2.496e+00  11.884  < 2e-16 ***
## longitude        3.496e+00  2.547e+00   1.373 0.170742    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.455 on 355 degrees of freedom
## Multiple R-squared:  0.7455, Adjusted R-squared:  0.7419 
## F-statistic:   208 on 5 and 355 DF,  p-value: < 2.2e-16
plot(model2)

# Perform the Durbin-Watson test
library(car)
d = durbinWatsonTest(model2)
d
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01887703      2.024025   0.826
##  Alternative hypothesis: rho != 0
s2 <- summary(model2)
data.frame(s2$coefficients)
##                      Estimate   Std..Error    t.value     Pr...t..
## (Intercept)     -1.158639e+03 3.239736e+02  -3.576339 3.967260e-04
## house.age       -2.895541e-02 2.101116e-03 -13.780967 6.611972e-35
## distance.to.MRT -5.908531e-04 4.583443e-05 -12.891031 1.873129e-31
## stores.number    6.768070e-02 1.034982e-02   6.539313 2.155294e-10
## latitude         2.966045e+01 2.495818e+00  11.884060 1.196885e-27
## longitude        3.495557e+00 2.546659e+00   1.372605 1.707416e-01

\[ Y^{0.5}_{unit.price} = -1158.639 -0.029*X_{house.age} -0.0006*X_{distance.to.MRT} + 0.067*X_{stores.number} + 29.66*X_{latitude}+3.49*X_{longitude}\]

Homoscedasticity:

residuals <- model2$residuals
ggplot(data = data.frame(residuals = residuals), aes(x = fitted(model2), y = residuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "blue") +
  labs(title = "Residuals vs. Fitted Values", x = "Fitted Values", y = "Residuals")
## `geom_smooth()` using formula = 'y ~ x'

qqnorm(residuals)

# QQ-plot
qqnorm(residuals)
qqline(residuals)

# Shapiro-Wilk test
shapiro.test(residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.99652, p-value = 0.6245

The p-value is greater than significance level, so fail to reject the null hypothesis, meaning that there is strong evidence to suggest normality.

Multicollinearity:

vif(model2)
##       house.age distance.to.MRT   stores.number        latitude       longitude 
##        1.012439        1.963795        1.479819        1.131224        1.443359
vif(model1)
##       house.age distance.to.MRT   stores.number        latitude       longitude 
##        1.014249        4.282985        1.613339        1.599017        2.923881

Model Selection:

p=5
models =regsubsets(unit.price^0.5~., data =data_no_outlier, nvmax =p)
summary(models)
## Subset selection object
## Call: regsubsets.formula(unit.price^0.5 ~ ., data = data_no_outlier, 
##     nvmax = p)
## 5 Variables  (and intercept)
##                 Forced in Forced out
## house.age           FALSE      FALSE
## distance.to.MRT     FALSE      FALSE
## stores.number       FALSE      FALSE
## latitude            FALSE      FALSE
## longitude           FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          house.age distance.to.MRT stores.number latitude longitude
## 1  ( 1 ) " "       "*"             " "           " "      " "      
## 2  ( 1 ) "*"       "*"             " "           " "      " "      
## 3  ( 1 ) "*"       "*"             " "           "*"      " "      
## 4  ( 1 ) "*"       "*"             "*"           "*"      " "      
## 5  ( 1 ) "*"       "*"             "*"           "*"      "*"
modelss <- summary(models)
data.frame(modelss$outmat)
##          house.age distance.to.MRT stores.number latitude longitude
## 1  ( 1 )                         *                                 
## 2  ( 1 )         *               *                                 
## 3  ( 1 )         *               *                      *          
## 4  ( 1 )         *               *             *        *          
## 5  ( 1 )         *               *             *        *         *
m1 <- lm(unit.price^0.5~distance.to.MRT, data =data_no_outlier)
m2 <- lm(unit.price^0.5~house.age+distance.to.MRT, data =data_no_outlier)
m3 <- lm(unit.price^0.5~house.age+distance.to.MRT+latitude, data =data_no_outlier)
m4 <- lm(unit.price^0.5~house.age+distance.to.MRT+stores.number+latitude, data=data_no_outlier)
m5 <- lm(unit.price^0.5~house.age+distance.to.MRT+stores.number+latitude+longitude, data =data_no_outlier)
# Create a vector of model names
model_names <- c("m1", "m2", "m3", "m4", "m5")

# Create an empty data frame to store AIC values
aic_data <- data.frame(Model = character(length(model_names)), AIC = numeric(length(model_names)))

# Calculate and store AIC values for each model
for (i in 1:length(model_names)) {
  model <- get(model_names[i])  # Get the model by name
  aic_value <- AIC(model)
  aic_data[i, ] <- c(model_names[i], aic_value)
}

# Display the table of AIC values
print(aic_data)
##   Model              AIC
## 1    m1 714.114574310383
## 2    m2 627.925936238496
## 3    m3  502.27481418991
## 4    m4 463.809226197423
## 5    m5 463.898403702297
result.sum = summary(models)
criteria <- data.frame(Nvar = 1:(p),
                       R2 = result.sum$rsq,
                       R2adj = result.sum$adjr2,
                       CP = result.sum$cp,
                       BIC = result.sum$bic)
criteria <- cbind(criteria, AIC = as.numeric(aic_data$AIC))
print(criteria)
##   Nvar        R2     R2adj         CP       BIC      AIC
## 1    1 0.4796386 0.4781891 368.889675 -224.0389 714.1146
## 2    2 0.5924215 0.5901445 213.560759 -306.3387 627.9259
## 3    3 0.7138176 0.7114127  46.216537 -428.1009 502.2748
## 4    4 0.7441641 0.7412895   5.884046 -462.6776 463.8092
## 5    5 0.7455146 0.7419303   6.000000 -458.6996 463.8984
ggplot(data = criteria, aes(x = Nvar)) +
  geom_line(aes(y = R2), color = "red") +
  labs(title = "R2")