1 Load packages

library(tidyverse)
library(broom)

2 Load dataset

# Read the housing data (columns: price, houseSize) into a tibble.
# The data frame is deliberately NOT attach()ed: every later step refers to
# the columns through `data = house` or `house$...`, and attaching would put
# copies of the columns on the search path where they can silently mask, or
# be masked by, other objects with the same names.
house <- read_csv("house.csv")

2.1 Overview of the dataset

# Compact column-wise overview: dimensions, column types and first values.
house %>%
  glimpse()
Rows: 200
Columns: 2
$ price     <dbl> 68.92162, 63.97395, 69.51606, 68.86582, 69.24784, 68.54808,…
$ houseSize <dbl> 2503.489, 2362.630, 2522.676, 2491.663, 2511.292, 2476.032,…
# Five-number summary plus mean for every column.
house %>%
  summary()
     price         houseSize   
 Min.   :59.89   Min.   :2208  
 1st Qu.:65.79   1st Qu.:2437  
 Median :67.28   Median :2504  
 Mean   :67.45   Mean   :2502  
 3rd Qu.:69.27   3rd Qu.:2579  
 Max.   :74.02   Max.   :2769  

3 Visualise data

# Histogram of house prices.
# A single geom_histogram() layer is used: qplot(geom = "histogram") followed
# by another geom_histogram() drew two identical histograms on top of each
# other, and qplot() is deprecated in current ggplot2.
# bins = 30 is the value ggplot2 would pick by default; stating it keeps the
# same bars while avoiding the "pick a better value" message.
ggplot(house, aes(x = price)) +
  geom_histogram(bins = 30, color = "black", fill = "#d95f02")

Interpretation:



# Histogram of house sizes.
# A single geom_histogram() layer is used: qplot(geom = "histogram") followed
# by another geom_histogram() drew two identical histograms on top of each
# other, and qplot() is deprecated in current ggplot2.
# bins = 30 is the value ggplot2 would pick by default; stating it keeps the
# same bars while avoiding the "pick a better value" message.
ggplot(house, aes(x = houseSize)) +
  geom_histogram(bins = 30, color = "black", fill = "#1b9e77")

Interpretation:



# Scatter plot of price against house size.
# Written with ggplot() + geom_point() because qplot() is deprecated in
# current ggplot2; the resulting plot is the same.
ggplot(house, aes(x = houseSize, y = price)) +
  geom_point() +
  xlab("House size (sqft)") +
  ylab("Price") +
  labs(title = "Price vs House size (sqft)")

# Pearson correlation between price and house size.
with(house, cor(price, houseSize))
[1] 0.9331807

Interpretation:



4 Fit a regression model

# Simple linear regression of price on house size (least squares).
slrfit <- lm(formula = price ~ houseSize, data = house)

# Print the call and the estimated intercept and slope.
slrfit

Call:
lm(formula = price ~ houseSize, data = house)

Coefficients:
(Intercept)    houseSize  
    5.15772      0.02489  
# Scatter plot of price against house size with the fitted regression line.
# The intercept and slope are read from the fitted model rather than typed in
# by hand, so the line is drawn with the unrounded estimates and stays correct
# if the data or the model change. ggplot() + geom_point() replaces the
# deprecated qplot().
ggplot(house, aes(x = houseSize, y = price)) +
  geom_point() +
  xlab("House size (sqft)") +
  ylab("Price") +
  labs(title = "Price vs House size (sqft)") +
  geom_abline(
    intercept = coef(slrfit)[["(Intercept)"]],
    slope = coef(slrfit)[["houseSize"]],
    colour = "forestgreen",
    lwd = 2
  )

Write the fitted regression model:



5 Model adequacy checking

5.1 Compute fitted values and residuals

# Observation-level model output from broom: the original data plus fitted
# values (.fitted), residuals (.resid) and related diagnostics.
slrfit_values <- slrfit %>%
  augment()

# Show the first rows of the augmented table.
slrfit_values
# A tibble: 200 x 9
   price houseSize .fitted .se.fit   .resid    .hat .sigma    .cooksd .std.resid
   <dbl>     <dbl>   <dbl>   <dbl>    <dbl>   <dbl>  <dbl>      <dbl>      <dbl>
 1  68.9     2503.    67.5  0.0692  1.45    0.00500  0.975    5.52e-3    1.48   
 2  64.0     2363.    64.0  0.118   0.00442 0.0145   0.981    1.52e-7    0.00455
 3  69.5     2523.    68.0  0.0705  1.56    0.00520  0.974    6.70e-3    1.60   
 4  68.9     2492.    67.2  0.0696  1.68    0.00506  0.973    7.57e-3    1.73   
 5  69.2     2511.    67.7  0.0694  1.58    0.00504  0.974    6.62e-3    1.62   
 6  68.5     2476.    66.8  0.0715  1.76    0.00534  0.973    8.69e-3    1.80   
 7  64.8     2471.    66.7  0.0723 -1.86    0.00547  0.972    1.00e-2   -1.91   
 8  71.6     2634.    70.7  0.113   0.840   0.0134   0.979    5.08e-3    0.864  
 9  70.2     2581.    69.4  0.0874  0.808   0.00799  0.979    2.77e-3    0.830  
10  65.9     2424.    65.5  0.0873  0.414   0.00796  0.980    7.25e-4    0.425  
# … with 190 more rows

5.2 Residuals vs fitted values

Write the code to plot residuals vs fitted values:

# Residuals plotted against fitted values.
# Written with ggplot() + geom_point() because qplot() is deprecated in
# current ggplot2; the resulting plot is the same.
ggplot(slrfit_values, aes(x = .fitted, y = .resid)) +
  geom_point()


5.3 Normality test of residuals

# Histogram of the residuals.
# A single geom_histogram() layer is used: qplot(geom = "histogram") followed
# by another geom_histogram() drew two identical histograms on top of each
# other, and qplot() is deprecated in current ggplot2.
# bins = 30 is the value ggplot2 would pick by default; stating it keeps the
# same bars while avoiding the "pick a better value" message.
ggplot(slrfit_values, aes(x = .resid)) +
  geom_histogram(bins = 30, color = "black", fill = "lightblue")

# Normal Q-Q plot of the residuals with a reference line.
ggplot(slrfit_values, aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line() +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles")

# Shapiro-Wilk normality test applied to the residuals (.resid column of the
# augmented model table). The argument expression is echoed in the printed
# "data:" line of the test output.
shapiro.test(slrfit_values$.resid)

    Shapiro-Wilk normality test

data:  slrfit_values$.resid
W = 0.99755, p-value = 0.9897

H0:

H1:

Decision:

Conclusion:

6 Interpretation of the model output

# Full regression summary: residual quantiles, coefficient table with
# standard errors and t-tests, residual standard error, R-squared and F-test.
slrfit %>%
  summary()

Call:
lm(formula = price ~ houseSize, data = house)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.67119 -0.63937  0.00189  0.67506  2.79721 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.1577228  1.7063810   3.023  0.00284 ** 
houseSize   0.0248925  0.0006813  36.535  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9781 on 198 degrees of freedom
Multiple R-squared:  0.8708,    Adjusted R-squared:  0.8702 
F-statistic:  1335 on 1 and 198 DF,  p-value: < 2.2e-16

6.1 Coefficient of determination

\(R^2 = 87.08\%\) (adjusted \(R^2 = 87.02\%\))

Interpretation:





6.2 Point estimate of \(\beta_0\)

Interpretation:



6.3 Interval estimate of \(\beta_0\)

Interpretation:



6.4 Point estimate of \(\beta_1\)

Interpretation:



6.5 Interval estimate of \(\beta_1\)

Interpretation:



7 Hypothesis testing on the Slope and Intercept

























8 Prediction of New Observations

# House sizes (sqft) at which the price is to be predicted.
# NOTE: 2000 is below the smallest house size in the data (about 2208 in the
# summary output), so that prediction is an extrapolation outside the range
# the model was fitted on and should be read with caution.
newhouseSize <- data.frame(houseSize = c(2000, 2500, 2300, 2400, 2760))

# Point predictions with prediction intervals (predict.lm's default level,
# 0.95). The interval type is spelled out in full instead of relying on
# partial matching of "predict" to "prediction".
predict(slrfit, newdata = newhouseSize, interval = "prediction")
       fit      lwr      upr
1 54.94275 52.89457 56.99092
2 67.38900 65.45527 69.32274
3 62.41050 60.45773 64.36326
4 64.89975 62.96113 66.83837
5 73.86105 71.89660 75.82551