library(tidyverse)
library(performance)
library(ggeffects)
library(gtsummary)
library(gt)
library(flextable)
library(equatiomatic)
library(sjPlot)
library(emmeans)
library(effectsize)
library(randomForest)
library(vip)
Multiple Linear Regression in R
Introduction
This tutorial covers the process of performing multiple linear regression in R using the mtcars dataset. We will explore data preparation, model building, assumption checking, visualization, and interpretation of the results.
Description
Multiple linear regression is a statistical technique that models the relationship between a dependent variable and multiple independent variables. In this tutorial, we will:
- Load necessary packages for data manipulation and visualization.
- Prepare the dataset by filtering and sampling data.
- Build a linear regression model to predict miles per gallon based on horsepower, weight, and cylinder count.
- Check model assumptions to ensure the validity of our model.
- Visualize predictions and create informative plots.
- Generate and save tables summarizing the regression results.
- Interpret the results, focusing on variable impacts and statistical significance.
- Evaluate variable importance using effect sizes and random forest methods.
Load Packages
Data Preparation
# Copy the built-in mtcars data and treat cylinder count as categorical,
# so the model estimates separate effects for 6 and 8 cylinders vs the
# 4-cylinder reference group.
df <- mtcars
df[["cyl"]] <- as.factor(df[["cyl"]])
glimpse(df)
Rows: 32
Columns: 11
$ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
$ cyl  <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
$ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
$ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
$ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
$ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
$ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
$ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
$ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
$ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
$ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…
Table 1 shows the first 6 cars from the dataset.
df |>
  slice_head(n = 6) |>
  select(mpg,hp, wt, cyl,disp) |>
  gt()
| mpg | hp | wt | cyl | disp | 
|---|---|---|---|---|
| 21.0 | 110 | 2.620 | 6 | 160 | 
| 21.0 | 110 | 2.875 | 6 | 160 | 
| 22.8 | 93 | 2.320 | 4 | 108 | 
| 21.4 | 110 | 3.215 | 6 | 258 | 
| 18.7 | 175 | 3.440 | 8 | 360 | 
| 18.1 | 105 | 3.460 | 6 | 225 | 
Build the Model
# Fit the multiple linear regression of fuel economy (mpg) on horsepower,
# weight, cylinder count (factor — 4 cylinders is the reference level),
# and engine displacement
fit <- lm(mpg ~ hp + wt + cyl + disp, data = df)
summary(fit)
Call:
lm(formula = mpg ~ hp + wt + cyl + disp, data = df)
Residuals:
    Min      1Q  Median      3Q     Max 
-4.2740 -1.0349 -0.3831  0.9810  5.4192 
Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.002405   2.130726  16.897 1.54e-15 ***
hp          -0.023517   0.012216  -1.925  0.06523 .  
wt          -3.428626   1.055455  -3.248  0.00319 ** 
cyl6        -3.466011   1.462979  -2.369  0.02554 *  
cyl8        -3.753227   2.813996  -1.334  0.19385    
disp         0.004199   0.012917   0.325  0.74774    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.482 on 26 degrees of freedom
Multiple R-squared:  0.8578,    Adjusted R-squared:  0.8305 
F-statistic: 31.37 on 5 and 26 DF,  p-value: 3.18e-10
Check Model Assumptions
# Base-R diagnostic plots (residuals vs fitted, Q-Q, scale-location,
# residuals vs leverage) arranged in a 2x2 grid. Save and restore the
# previous par() settings so the global graphics state is not left
# modified for later plots in the document.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)
# Composite diagnostic plot from the performance package for the same model
check_model(fit)
check_normality(fit)
OK: residuals appear as normally distributed (p = 0.172).
check_heteroscedasticity(fit)
OK: Error variance appears to be homoscedastic (p = 0.066).
# Save model diagnostics plot to a PNG file
# Write the diagnostics to a 10x10 inch, 600 dpi PNG
# (assumes a "figures/" directory already exists — TODO confirm)
png("figures/diagnostic_plot.png", width = 10, height = 10, units = "in", res = 600)
check_model(fit)
dev.off()
png 
  2 
# Display the plot in the Quarto document
knitr::include_graphics("figures/diagnostic_plot.png")
Visualize Prediction
Marginal Effects Plot or "Effect Plot"
ggeffect(fit)$hp
# Predicted values of mpg
 hp | Predicted |       95% CI
------------------------------
 50 |     22.36 | 19.77, 24.95
 85 |     21.54 | 19.75, 23.33
120 |     20.72 | 19.59, 21.84
155 |     19.90 | 18.97, 20.82
195 |     18.95 | 17.44, 20.47
230 |     18.13 | 15.85, 20.41
265 |     17.31 | 14.20, 20.41
335 |     15.66 | 10.85, 20.48
$wt
# Predicted values of mpg
  wt | Predicted |       95% CI
-------------------------------
1.40 |     26.32 | 22.28, 30.37
2.00 |     24.26 | 21.47, 27.05
2.40 |     22.89 | 20.90, 24.88
2.80 |     21.52 | 20.24, 22.80
3.40 |     19.46 | 18.48, 20.45
4.00 |     17.41 | 15.48, 19.33
4.40 |     16.04 | 13.32, 18.76
5.40 |     12.61 |  7.79, 17.43
$cyl
# Predicted values of mpg
cyl | Predicted |       95% CI
------------------------------
4   |     22.49 | 19.37, 25.62
6   |     19.02 | 16.75, 21.30
8   |     18.74 | 15.72, 21.76
$disp
# Predicted values of mpg
disp | Predicted |       95% CI
-------------------------------
  70 |     19.42 | 15.05, 23.78
 120 |     19.63 | 16.55, 22.70
 170 |     19.84 | 17.99, 21.68
 220 |     20.05 | 19.10, 20.99
 275 |     20.28 | 18.79, 21.76
 325 |     20.49 | 17.83, 23.15
 375 |     20.70 | 16.76, 24.63
 475 |     21.12 | 14.57, 27.66
attr(,"class")
[1] "ggalleffects" "list"        
attr(,"model.name")
[1] "fit"
# Marginal Effects Plot
# Marginal (adjusted) predictions for every model term, plotted and then
# arranged into a single grid of panels
effects_plot <- ggeffect(fit) %>% 
  plot() %>% 
  sjPlot::plot_grid()
effects_plot
TableGrob (2 x 2) "arrange": 4 grobs
     z     cells    name           grob
hp   1 (1-1,1-1) arrange gtable[layout]
wt   2 (1-1,2-2) arrange gtable[layout]
cyl  3 (2-2,1-1) arrange gtable[layout]
disp 4 (2-2,2-2) arrange gtable[layout]
Save the Effects Plot
# Save the effects plot.
# NOTE(review): the original called setwd("D:/upwork") here — a
# machine-specific absolute path that also silently changes where the
# script's other relative paths ("figures/", "tables/") resolve.
# Dropped in favour of project-relative paths throughout.
ggsave("figures/effects_plot.png",
       plot = effects_plot,
       device = "png", dpi = 600, width = 9, height = 7)
Get summary Table
# Publication-style regression table: betas, 95% CIs, and p-values
# rendered to three decimals, with significant p-values bolded
summary_table <- fit %>%
  tbl_regression(pvalue_fun = ~ style_pvalue(.x, digits = 3)) %>%
  bold_p()
summary_table
| Characteristic | Beta | 95% CI1 | p-value | 
|---|---|---|---|
| hp | -0.02 | -0.05, 0.00 | 0.065 | 
| wt | -3.4 | -5.6, -1.3 | 0.003 | 
| cyl | |||
| 4 | — | — | |
| 6 | -3.5 | -6.5, -0.46 | 0.026 | 
| 8 | -3.8 | -9.5, 2.0 | 0.194 | 
| disp | 0.00 | -0.02, 0.03 | 0.748 | 
| 1 CI = Confidence Interval | |||
# Create regression table with t-values
# Relabel the columns (bold markdown headers) and bold the variable labels
summary_table2 <- tbl_regression(fit) %>%
  modify_header(
    label ~ "**Variable**",
    estimate ~ "**Estimate**",
    std.error ~ "**Std. Error**",
    statistic ~ "**t value**",
    p.value ~ "**P-value**"
  ) %>%
  bold_labels()
summary_table2
| Variable | Estimate | Std. Error1 | t value | 95% CI1 | P-value | 
|---|---|---|---|---|---|
| hp | -0.02 | 0.012 | -1.93 | -0.05, 0.00 | 0.065 | 
| wt | -3.4 | 1.06 | -3.25 | -5.6, -1.3 | 0.003 | 
| cyl | |||||
| 4 | — | — | — | — | |
| 6 | -3.5 | 1.46 | -2.37 | -6.5, -0.46 | 0.026 | 
| 8 | -3.8 | 2.81 | -1.33 | -9.5, 2.0 | 0.2 | 
| disp | 0.00 | 0.013 | 0.325 | -0.02, 0.03 | 0.7 | 
| 1 SE = Standard Error, CI = Confidence Interval | |||||
Save the Summary Table
# Convert the gtsummary table to a flextable and export it to Word
# (assumes a "tables/" directory already exists — TODO confirm)
summary_table2 %>% 
  as_flex_table() %>% 
  save_as_docx(path = "tables/summary_table2.docx")
summary_table2 %>% 
  as_flex_table() %>% 
  save_as_image(path = "tables/summary_table2.png")
[1] "tables/summary_table2.png"
Interpretation
- Beta for horsepower is -0.02: Every increase of 1 horsepower results in a decrease in mpg by 0.02, assuming all other predictors remain constant.
- Weight Impact: Heavier cars tend to have lower mpg.
- Cylinder Impact: More cylinders often correlate with lower mpg.
Produce Model Equation
extract_eq(fit)
\[ \operatorname{mpg} = \alpha + \beta_{1}(\operatorname{hp}) + \beta_{2}(\operatorname{wt}) + \beta_{3}(\operatorname{cyl}_{\operatorname{6}}) + \beta_{4}(\operatorname{cyl}_{\operatorname{8}}) + \beta_{5}(\operatorname{disp}) + \epsilon \]
Multivariable vs. Univariable Results
tbl_regression(lm(mpg ~ cyl, data = df))
| Characteristic | Beta | 95% CI1 | p-value | 
|---|---|---|---|
| cyl | |||
| 4 | — | — | |
| 6 | -6.9 | -10, -3.7 | <0.001 | 
| 8 | -12 | -14, -8.9 | <0.001 | 
| 1 CI = Confidence Interval | |||
# Interaction model: allow the hp slope on mpg to differ across cylinder groups
fit2 <- lm(mpg ~ hp * cyl, data = df)
# Plot the hp x cyl interaction (one predicted line per cylinder group)
plot_model(fit2, type = "int")
emmeans(fit2, pairwise ~ cyl | hp, weights = "prop")$contrasts
hp = 147:
 contrast    estimate   SE df t.ratio p.value
 cyl4 - cyl6   -0.117 3.50 26  -0.033  0.9994
 cyl4 - cyl8    3.450 3.34 26   1.034  0.5628
 cyl6 - cyl8    3.566 2.14 26   1.668  0.2365
P value adjustment: tukey method for comparing a family of 3 estimates 
Variable Importance
Variable importance based on effect size
# Variable importance based on effect size: partial eta-squared for each
# predictor, plus a qualitative interpretation label (small/medium/large)
ef_size <- eta_squared(fit) %>% 
  mutate(Interpret = interpret_eta_squared(Eta2_partial))
ef_size
# Effect Size for ANOVA
Parameter | Eta2 (partial) |       95% CI |  Interpret
------------------------------------------------------
hp        |           0.81 | [0.69, 1.00] |      large
wt        |           0.61 | [0.40, 1.00] |      large
cyl       |           0.18 | [0.00, 1.00] |      large
disp      |       4.05e-03 | [0.00, 1.00] | very small
ef_size %>% 
  ggplot(aes(x = reorder(Parameter, Eta2_partial), y = Eta2_partial)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Interpret), size = 4, hjust = -0.1, fontface = "bold") +
  coord_flip() +
  xlab("") 
variable importance plot using the random forest method
# Generate a variable importance plot using the random forest method.
# Random forests are stochastic: without a fixed seed the importance
# ranking can change between runs, so set one for reproducibility.
set.seed(123)
randomForest(mpg ~ hp + wt + cyl + disp, data = df) %>% 
  vip()
t-values as important
# t-values as important
# For an lm object, vip() presumably ranks predictors by the absolute
# t-statistic — confirm against the vip::vi_model documentation
vip(fit)