library(tidyverse)
library(broom)
# Load the housing data (two numeric columns: price, houseSize) and inspect
# its structure.
# attach() is deliberately not used: every later call passes `data = house`
# or refers to `house$...`, and attaching a data frame can silently mask
# other objects on the search path.
house <- read_csv("house.csv")
glimpse(house)
Rows: 200
Columns: 2
$ price <dbl> 68.92162, 63.97395, 69.51606, 68.86582, 69.24784, 68.54808,…
$ houseSize <dbl> 2503.489, 2362.630, 2522.676, 2491.663, 2511.292, 2476.032,…
# Five-number summary and mean of each column in the data set.
print(summary(house))
price houseSize
Min. :59.89 Min. :2208
1st Qu.:65.79 1st Qu.:2437
Median :67.28 Median :2504
Mean :67.45 Mean :2502
3rd Qu.:69.27 3rd Qu.:2579
Max. :74.02 Max. :2769
# Histogram of house prices.
# ggplot() replaces the deprecated qplot(), and only one histogram layer is
# drawn: qplot(geom = "histogram") + geom_histogram() stacked two identical
# layers on top of each other. bins = 30 is the ggplot2 default, stated
# explicitly so the "pick a better binwidth" message is not emitted.
ggplot(house, aes(x = price)) +
  geom_histogram(bins = 30, color = "black", fill = "#d95f02")
Interpretation:
# Histogram of house sizes.
# ggplot() replaces the deprecated qplot(), and only one histogram layer is
# drawn: qplot(geom = "histogram") + geom_histogram() stacked two identical
# layers on top of each other. bins = 30 is the ggplot2 default, stated
# explicitly so the "pick a better binwidth" message is not emitted.
ggplot(house, aes(x = houseSize)) +
  geom_histogram(bins = 30, color = "black", fill = "#1b9e77")
Interpretation:
# Scatter plot of price against house size, to check visually whether a
# linear relationship is plausible.
# ggplot() + geom_point() replaces the deprecated qplot().
ggplot(house, aes(x = houseSize, y = price)) +
  geom_point() +
  labs(
    title = "Price vs House size (sqft)",
    x = "House size (sqft)",
    y = "Price"
  )
# Pearson correlation coefficient between price and house size.
with(house, cor(price, houseSize))
[1] 0.9331807
Interpretation:
# Fit the simple linear regression of price on house size and print the
# estimated intercept and slope.
slrfit <- lm(formula = price ~ houseSize, data = house)
print(slrfit)
Call:
lm(formula = price ~ houseSize, data = house)
Coefficients:
(Intercept) houseSize
5.15772 0.02489
# Scatter plot of price against house size with the fitted regression line.
# The intercept and slope are read from the fitted model instead of being
# typed in as rounded constants, so the line stays correct if the data or the
# model change. ggplot() + geom_point() replaces the deprecated qplot().
slr_coef <- coef(slrfit)
ggplot(house, aes(x = houseSize, y = price)) +
  geom_point() +
  geom_abline(
    intercept = slr_coef[["(Intercept)"]],
    slope = slr_coef[["houseSize"]],
    colour = "forestgreen",
    lwd = 2
  ) +
  labs(
    title = "Price vs House size (sqft)",
    x = "House size (sqft)",
    y = "Price"
  )
Write the fitted regression model:
# Observation-level model output (fitted values, residuals, leverage, ...)
# as a tibble, one row per house.
slrfit_values <- broom::augment(slrfit)
print(slrfit_values)
# A tibble: 200 x 9
price houseSize .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 68.9 2503. 67.5 0.0692 1.45 0.00500 0.975 5.52e-3 1.48
2 64.0 2363. 64.0 0.118 0.00442 0.0145 0.981 1.52e-7 0.00455
3 69.5 2523. 68.0 0.0705 1.56 0.00520 0.974 6.70e-3 1.60
4 68.9 2492. 67.2 0.0696 1.68 0.00506 0.973 7.57e-3 1.73
5 69.2 2511. 67.7 0.0694 1.58 0.00504 0.974 6.62e-3 1.62
6 68.5 2476. 66.8 0.0715 1.76 0.00534 0.973 8.69e-3 1.80
7 64.8 2471. 66.7 0.0723 -1.86 0.00547 0.972 1.00e-2 -1.91
8 71.6 2634. 70.7 0.113 0.840 0.0134 0.979 5.08e-3 0.864
9 70.2 2581. 69.4 0.0874 0.808 0.00799 0.979 2.77e-3 0.830
10 65.9 2424. 65.5 0.0873 0.414 0.00796 0.980 7.25e-4 0.425
# … with 190 more rows
Write the code to plot the residuals vs the fitted values:
# Residuals against fitted values, used to check the linearity and
# constant-variance assumptions.
# ggplot() + geom_point() replaces the deprecated qplot().
ggplot(slrfit_values, aes(x = .fitted, y = .resid)) +
  geom_point()
# Histogram of the residuals, a visual check of the normality assumption.
# ggplot() replaces the deprecated qplot(), and only one histogram layer is
# drawn: qplot(geom = "histogram") + geom_histogram() stacked two identical
# layers on top of each other. bins = 30 is the ggplot2 default, stated
# explicitly so the "pick a better binwidth" message is not emitted.
ggplot(slrfit_values, aes(x = .resid)) +
  geom_histogram(bins = 30, color = "black", fill = "lightblue")
# Normal Q-Q plot of the residuals with a reference line.
ggplot(data = slrfit_values, mapping = aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line() +
  labs(
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  )
# Shapiro-Wilk test of normality on the model residuals.
# The argument expression is kept exactly as written because it is echoed
# verbatim in the "data:" line of the printed test result.
shapiro.test(slrfit_values$.resid)
Shapiro-Wilk normality test
data: slrfit_values$.resid
W = 0.99755, p-value = 0.9897
H0:
H1:
Decision:
Conclusion:
# Full regression summary: coefficient table with t-tests, residual standard
# error, R-squared and the overall F-test.
print(summary(slrfit))
Call:
lm(formula = price ~ houseSize, data = house)
Residuals:
Min 1Q Median 3Q Max
-2.67119 -0.63937 0.00189 0.67506 2.79721
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.1577228 1.7063810 3.023 0.00284 **
houseSize 0.0248925 0.0006813 36.535 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9781 on 198 degrees of freedom
Multiple R-squared: 0.8708, Adjusted R-squared: 0.8702
F-statistic: 1335 on 1 and 198 DF, p-value: < 2.2e-16
\(R^2 = 87.08\%\)
Interpretation:
Interpretation:
Interpretation:
Interpretation:
Interpretation:
# Predict the price for five new house sizes, with 95% prediction intervals
# (0.95 is the default level of predict.lm).
# The interval type is spelled out in full as "prediction"; the previous
# "predict" only worked through partial matching of the argument value.
newhouseSize <- data.frame(houseSize = c(2000, 2500, 2300, 2400, 2760))
predict(slrfit, newdata = newhouseSize, interval = "prediction")
fit lwr upr
1 54.94275 52.89457 56.99092
2 67.38900 65.45527 69.32274
3 62.41050 60.45773 64.36326
4 64.89975 62.96113 66.83837
5 73.86105 71.89660 75.82551