Multiple Linear Regression in R
Introduction
This tutorial covers the process of performing multiple linear regression in R using the mtcars
dataset. We will explore data preparation, model building, assumption checking, visualization, and interpretation of the results.
Description
Multiple linear regression is a statistical technique that models the relationship between a dependent variable and multiple independent variables. In this tutorial, we will:
- Load necessary packages for data manipulation and visualization.
- Prepare the dataset by filtering and sampling data.
- Build a linear regression model to predict miles per gallon based on horsepower, weight, and cylinder count.
- Check model assumptions to ensure the validity of our model.
- Visualize predictions and create informative plots.
- Generate and save tables summarizing the regression results.
- Interpret the results, focusing on variable impacts and statistical significance.
- Evaluate variable importance using effect sizes and random forest methods.
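Before turning to R, it helps to state the general form of the model: a multiple linear regression with p predictors models the response as
\[ y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + \dots + \beta_p x_{ip} + \epsilon_i \]
where the errors \(\epsilon_i\) are assumed to be independent and normally distributed with constant variance; these are the assumptions we check later in the tutorial.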
Load Packages
library(tidyverse)
library(performance)
library(ggeffects)
library(gtsummary)
library(gt)
library(flextable)
library(equatiomatic)
library(sjPlot)
library(emmeans)
library(effectsize)
library(randomForest)
library(vip)
Data Preparation
df <- mtcars
df$cyl <- factor(df$cyl)
glimpse(df)
Rows: 32
Columns: 11
$ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
$ cyl <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
$ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
$ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
$ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
$ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
$ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
$ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
$ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
$ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
$ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…
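As a supplementary check (not part of the original code), the numeric predictors can be screened for strong pairwise correlations before modelling, since hp, wt, and disp all reflect engine size:
# Supplementary check: pairwise correlations among the numeric predictors
df %>%
  select(hp, wt, disp) %>%
  cor() %>%
  round(2)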
Table 1 shows the first 6 cars from the dataset.
df |>
  slice_head(n = 6) |>
  select(mpg, hp, wt, cyl, disp) |>
  gt()
mpg | hp | wt | cyl | disp |
---|---|---|---|---|
21.0 | 110 | 2.620 | 6 | 160 |
21.0 | 110 | 2.875 | 6 | 160 |
22.8 | 93 | 2.320 | 4 | 108 |
21.4 | 110 | 3.215 | 6 | 258 |
18.7 | 175 | 3.440 | 8 | 360 |
18.1 | 105 | 3.460 | 6 | 225 |
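If desired, the gt table can also carry a caption and rounded numbers; a minimal sketch (the title text below is illustrative, not from the original):
df |>
  slice_head(n = 6) |>
  select(mpg, hp, wt, cyl, disp) |>
  gt() |>
  # Add a table title and round mpg and wt to one decimal place
  tab_header(title = "Table 1: First six cars in mtcars") |>
  fmt_number(columns = c(mpg, wt), decimals = 1)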
Build the Model
fit <- lm(mpg ~ hp + wt + cyl + disp, data = df)
summary(fit)
Call:
lm(formula = mpg ~ hp + wt + cyl + disp, data = df)
Residuals:
Min 1Q Median 3Q Max
-4.2740 -1.0349 -0.3831 0.9810 5.4192
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.002405 2.130726 16.897 1.54e-15 ***
hp -0.023517 0.012216 -1.925 0.06523 .
wt -3.428626 1.055455 -3.248 0.00319 **
cyl6 -3.466011 1.462979 -2.369 0.02554 *
cyl8 -3.753227 2.813996 -1.334 0.19385
disp 0.004199 0.012917 0.325 0.74774
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.482 on 26 degrees of freedom
Multiple R-squared: 0.8578, Adjusted R-squared: 0.8305
F-statistic: 31.37 on 5 and 26 DF, p-value: 3.18e-10
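The summary() output gives point estimates and p-values; matching 95% confidence intervals can be obtained with base R's confint(), or as a tidy data frame via broom (installed with the tidyverse but not attached by library(tidyverse)). Output is omitted here:
# 95% confidence intervals for the fitted coefficients
confint(fit)

# Tidy data frame of estimates, intervals, and p-values
broom::tidy(fit, conf.int = TRUE)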
Check Model Assumptions
par(mfrow = c(2,2))
plot(fit)
check_model(fit)
check_normality(fit)
OK: residuals appear as normally distributed (p = 0.172).
check_heteroscedasticity(fit)
OK: Error variance appears to be homoscedastic (p = 0.066).
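Because hp, wt, disp, and cyl are all tied to engine size, it is also worth checking multicollinearity; the performance package provides check_collinearity(), which reports variance inflation factors (output omitted):
# Variance inflation factors for the predictors in fit
check_collinearity(fit)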
# Save model diagnostics plot to a PNG file
png("figures/diagnostic_plot.png", width = 10, height = 10, units = "in", res = 600)
check_model(fit)
dev.off()
png
2
# Display the plot in the Quarto document
knitr::include_graphics("figures/diagnostic_plot.png")
Visualize Predictions
Marginal Effects Plot or “Effect Plot”
ggeffect(fit)
$hp
# Predicted values of mpg
hp | Predicted | 95% CI
------------------------------
50 | 22.36 | 19.77, 24.95
85 | 21.54 | 19.75, 23.33
120 | 20.72 | 19.59, 21.84
155 | 19.90 | 18.97, 20.82
195 | 18.95 | 17.44, 20.47
230 | 18.13 | 15.85, 20.41
265 | 17.31 | 14.20, 20.41
335 | 15.66 | 10.85, 20.48
$wt
# Predicted values of mpg
wt | Predicted | 95% CI
-------------------------------
1.40 | 26.32 | 22.28, 30.37
2.00 | 24.26 | 21.47, 27.05
2.40 | 22.89 | 20.90, 24.88
2.80 | 21.52 | 20.24, 22.80
3.40 | 19.46 | 18.48, 20.45
4.00 | 17.41 | 15.48, 19.33
4.40 | 16.04 | 13.32, 18.76
5.40 | 12.61 | 7.79, 17.43
$cyl
# Predicted values of mpg
cyl | Predicted | 95% CI
------------------------------
4 | 22.49 | 19.37, 25.62
6 | 19.02 | 16.75, 21.30
8 | 18.74 | 15.72, 21.76
$disp
# Predicted values of mpg
disp | Predicted | 95% CI
-------------------------------
70 | 19.42 | 15.05, 23.78
120 | 19.63 | 16.55, 22.70
170 | 19.84 | 17.99, 21.68
220 | 20.05 | 19.10, 20.99
275 | 20.28 | 18.79, 21.76
325 | 20.49 | 17.83, 23.15
375 | 20.70 | 16.76, 24.63
475 | 21.12 | 14.57, 27.66
attr(,"class")
[1] "ggalleffects" "list"
attr(,"model.name")
[1] "fit"
# Marginal Effects Plot
effects_plot <- ggeffect(fit) %>%
  plot() %>%
  sjPlot::plot_grid()
effects_plot
TableGrob (2 x 2) "arrange": 4 grobs
z cells name grob
hp 1 (1-1,1-1) arrange gtable[layout]
wt 2 (1-1,2-2) arrange gtable[layout]
cyl 3 (2-2,1-1) arrange gtable[layout]
disp 4 (2-2,2-2) arrange gtable[layout]
Save the Effects Plot
setwd("D:/upwork")
ggsave("figures/effects_plot.png",
plot = effects_plot,
device = png, dpi = 600, width = 9, height = 7)
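One caveat: setwd() with an absolute path such as "D:/upwork" only works on the author's machine. A more portable sketch, assuming the here package is installed, resolves the path relative to the project root:
# Portable alternative to setwd(): build the output path from the project root
ggsave(here::here("figures", "effects_plot.png"),
       plot = effects_plot,
       dpi = 600, width = 9, height = 7)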
Get Summary Table
summary_table <- tbl_regression(
  fit,
  pvalue_fun = ~ style_pvalue(.x, digits = 3)
) %>%
  bold_p()
summary_table
Characteristic | Beta | 95% CI1 | p-value |
---|---|---|---|
hp | -0.02 | -0.05, 0.00 | 0.065 |
wt | -3.4 | -5.6, -1.3 | 0.003 |
cyl | |||
4 | — | — | |
6 | -3.5 | -6.5, -0.46 | 0.026 |
8 | -3.8 | -9.5, 2.0 | 0.194 |
disp | 0.00 | -0.02, 0.03 | 0.748 |
1 CI = Confidence Interval |
# Create regression table with t-values
summary_table2 <- tbl_regression(fit) %>%
  modify_header(label ~ "**Variable**", estimate ~ "**Estimate**", std.error ~ "**Std. Error**", statistic ~ "**t value**", p.value ~ "**P-value**") %>%
  bold_labels()
summary_table2
Variable | Estimate | Std. Error1 | t value | 95% CI1 | P-value |
---|---|---|---|---|---|
hp | -0.02 | 0.012 | -1.93 | -0.05, 0.00 | 0.065 |
wt | -3.4 | 1.06 | -3.25 | -5.6, -1.3 | 0.003 |
cyl | |||||
4 | — | — | — | — | |
6 | -3.5 | 1.46 | -2.37 | -6.5, -0.46 | 0.026 |
8 | -3.8 | 2.81 | -1.33 | -9.5, 2.0 | 0.2 |
disp | 0.00 | 0.013 | 0.325 | -0.02, 0.03 | 0.7 |
1 SE = Standard Error, CI = Confidence Interval |
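The coefficient tables can be complemented with overall fit statistics; performance::model_performance() returns R², adjusted R², AIC, and RMSE for the model in a single call (output omitted):
# Overall fit statistics for the model (R2, adjusted R2, AIC, RMSE, ...)
model_performance(fit)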
Save the Summary Table
summary_table2 %>%
  as_flex_table() %>%
  save_as_docx(path = "tables/summary_table2.docx")

summary_table2 %>%
  as_flex_table() %>%
  save_as_image(path = "tables/summary_table2.png")
[1] "tables/summary_table2.png"
Interpretation
- Horsepower: the coefficient for hp is about -0.02, so each additional horsepower is associated with a decrease of roughly 0.02 mpg, holding the other predictors constant; this effect is only borderline significant (p = 0.065).
- Weight: the coefficient for wt is about -3.4, meaning each additional 1,000 lbs of weight is associated with a drop of roughly 3.4 mpg, holding the other predictors constant (p = 0.003).
- Cylinders: relative to 4-cylinder cars, 6-cylinder cars average about 3.5 mpg less (p = 0.026); the 8-cylinder contrast is similar in size but not statistically significant (p = 0.194), likely because hp, wt, and disp already capture much of that difference.
- Displacement: disp contributes essentially nothing once the other predictors are in the model (p = 0.748).
Produce Model Equation
extract_eq(fit)
\[ \operatorname{mpg} = \alpha + \beta_{1}(\operatorname{hp}) + \beta_{2}(\operatorname{wt}) + \beta_{3}(\operatorname{cyl}_{\operatorname{6}}) + \beta_{4}(\operatorname{cyl}_{\operatorname{8}}) + \beta_{5}(\operatorname{disp}) + \epsilon \]
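The same function can print the fitted equation with the estimated coefficients substituted for the beta symbols via its use_coefs argument (output omitted):
# Equation with estimated coefficients instead of beta symbols
extract_eq(fit, use_coefs = TRUE)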
Multivariable vs. Univariable Results
Fitting cyl on its own gives much larger estimated differences between cylinder groups (about -6.9 and -12 mpg below) than the adjusted model above, because the univariable model does not account for horsepower, weight, or displacement.
tbl_regression(lm(mpg ~ cyl, data = df))
Characteristic | Beta | 95% CI1 | p-value |
---|---|---|---|
cyl | |||
4 | — | — | |
6 | -6.9 | -10, -3.7 | <0.001 |
8 | -12 | -14, -8.9 | <0.001 |
1 CI = Confidence Interval |
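To compare unadjusted and adjusted estimates for every predictor side by side, gtsummary's tbl_uvregression() and tbl_merge() can be combined; a sketch (the spanner labels are illustrative):
# Univariable (unadjusted) models for each predictor, merged with the multivariable table
uni_tbl <- tbl_uvregression(
  df %>% select(mpg, hp, wt, cyl, disp),
  method = lm,
  y = mpg
)
tbl_merge(
  tbls = list(uni_tbl, summary_table),
  tab_spanner = c("**Univariable**", "**Multivariable**")
)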
fit2 <- lm(mpg ~ hp * cyl, data = df)
plot_model(fit2, type = "int")
emmeans(fit2, pairwise ~ cyl | hp, weights = "prop")$contrasts
hp = 147:
contrast estimate SE df t.ratio p.value
cyl4 - cyl6 -0.117 3.50 26 -0.033 0.9994
cyl4 - cyl8 3.450 3.34 26 1.034 0.5628
cyl6 - cyl8 3.566 2.14 26 1.668 0.2365
P value adjustment: tukey method for comparing a family of 3 estimates
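Since fit2 includes an hp × cyl interaction, the slope of hp is allowed to differ across cylinder groups; emmeans::emtrends() estimates those simple slopes (output omitted):
# Estimated slope of mpg on hp within each cylinder group
emtrends(fit2, ~ cyl, var = "hp")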
Variable Importance
Variable importance based on effect size
# Variable importance based on effect size (partial eta squared)
ef_size <- eta_squared(fit) %>%
  mutate(Interpret = interpret_eta_squared(Eta2_partial))
ef_size
# Effect Size for ANOVA
Parameter | Eta2 (partial) | 95% CI | Interpret
------------------------------------------------------
hp | 0.81 | [0.69, 1.00] | large
wt | 0.61 | [0.40, 1.00] | large
cyl | 0.18 | [0.00, 1.00] | large
disp | 4.05e-03 | [0.00, 1.00] | very small
ef_size %>%
  ggplot(aes(x = reorder(Parameter, Eta2_partial), y = Eta2_partial)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Interpret), size = 4, hjust = -0.1, fontface = "bold") +
  coord_flip() +
  xlab("")
Variable importance plot using the random forest method
# Generate a variable importance plot using the random forest method
randomForest(mpg ~ hp + wt + cyl + disp, data = df) %>%
vip()
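Random forests rely on random resampling, so the importance ranking can change from run to run; setting a seed first makes the plot reproducible (the seed value is arbitrary):
# Fix the seed so the forest, and therefore its importance ranking, is reproducible
set.seed(123)
rf_fit <- randomForest(mpg ~ hp + wt + cyl + disp, data = df, importance = TRUE)
vip(rf_fit)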
t-statistics as importance
# For lm models, vip() uses the absolute value of each coefficient's t-statistic as the importance measure
vip(fit)