Regularized regression

Keywords

biostatistics, healthcare, statistics, R, statistical testing, power analysis, GLM, regression

Introduction

This notebook demonstrates five regularized regression methods using the classic Prostate Cancer Dataset (Stamey et al., 1989).

Dataset

The prostate cancer dataset contains measurements from 97 men with prostate cancer who were about to receive radical prostatectomy. The goal is to predict log(PSA) levels based on clinical measurements.

required_packages <- c(
  "glmnet", "pls", "MASS", "patchwork",
  "reshape2", "tidyr", "dplyr", "tidyverse"
)

new_packages <- required_packages[!(required_packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages)

library(glmnet) # For Ridge, Lasso, Elastic Net

Loading required package: Matrix

Loaded glmnet 4.1-10

library(pls) # For PLS and PCR


Attaching package: 'pls'

The following object is masked from 'package:stats':

    loadings

library(MASS) # For additional statistical functions
library(ggplot2) # For visualizations
library(patchwork) # For arranging multiple plots


Attaching package: 'patchwork'

The following object is masked from 'package:MASS':

    area

library(reshape2) # For data manipulation
library(tidyr) # For data tidying


Attaching package: 'tidyr'

The following object is masked from 'package:reshape2':

    smiths

The following objects are masked from 'package:Matrix':

    expand, pack, unpack

library(dplyr) # For data manipulation


Attaching package: 'dplyr'

The following object is masked from 'package:MASS':

    select

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

theme_set(theme_minimal(base_size = 12))

prostate_url <- "https://hastie.su.domains/ElemStatLearn/datasets/prostate.data"
prostate <- read.table(prostate_url, header = TRUE, row.names = 1)

Data overview

head(prostate)

      lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
  train
1  TRUE
2  TRUE
3  TRUE
4  TRUE
5  TRUE
6  TRUE

summary(prostate)

     lcavol           lweight           age             lbph        
 Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
 1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
 Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
 Mean   : 1.3500   Mean   :3.629   Mean   :63.87   Mean   : 0.1004  
 3rd Qu.: 2.1270   3rd Qu.:3.876   3rd Qu.:68.00   3rd Qu.: 1.5581  
 Max.   : 3.8210   Max.   :4.780   Max.   :79.00   Max.   : 2.3263  
      svi              lcp             gleason          pgg45       
 Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
 1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
 Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
 Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
 3rd Qu.:0.0000   3rd Qu.: 1.1787   3rd Qu.:7.000   3rd Qu.: 40.00  
 Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
      lpsa           train        
 Min.   :-0.4308   Mode :logical  
 1st Qu.: 1.7317   FALSE:30       
 Median : 2.5915   TRUE :67       
 Mean   : 2.4784                  
 3rd Qu.: 3.0564                  
 Max.   : 5.5829

Variable Descriptions

lcavol: log(cancer volume)
lweight: log(prostate weight)
age: patient age in years
lbph: log(benign prostatic hyperplasia amount)
svi: seminal vesicle invasion (0/1)
lcp: log(capsular penetration)
gleason: Gleason score (6-9)
pgg45: percentage Gleason scores 4 or 5
lpsa: log(PSA) - outcome variable
train: training set indicator

Prepare data

train_data <- prostate[prostate$train == TRUE, ]
test_data <- prostate[prostate$train == FALSE, ]

train_data <- train_data[, -10]
test_data <- test_data[, -10]

# All predictors
x_train <- as.matrix(train_data[, -9]) 
y_train <- train_data$lpsa

x_test <- as.matrix(test_data[, -9])
y_test <- test_data$lpsa

# Standardize predictors --> Imp
x_train_scaled <- scale(x_train)
x_test_scaled <- scale(x_test,
  center = attr(x_train_scaled, "scaled:center"),
  scale = attr(x_train_scaled, "scaled:scale")
)

cat("Training set size:", nrow(train_data), "\n")

Training set size: 67

cat("Test set size:", nrow(test_data), "\n")

Test set size: 30

cat("Number of predictors:", ncol(x_train), "\n")

Number of predictors: 8

EDA

Correlations across features

cor_matrix <- cor(x_train)

get_upper_tri <- function(cormat) {
  cormat[lower.tri(cormat)] <- NA
  return(cormat)
}
upper_tri <- get_upper_tri(cor_matrix)
cor_melted <- melt(upper_tri, na.rm = TRUE)

p_corr <- ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = sprintf("%.2f", value)), size = 3) +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limit = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  labs(title = "Correlation Matrix of Prostate Cancer Predictors") +
  coord_fixed()

p_corr

Figure 1: Correlation matrix showing relationships among predictors.

Note the high correlation between lcavol and lcp (0.69) –> multicollinearity!

Response variable description

p_dist <- ggplot(data.frame(lpsa = y_train), aes(x = lpsa)) +
  geom_histogram(bins = 20, fill = "skyblue", color = "white", alpha = 0.8) +
  labs(
    title = "Distribution of log(PSA) in Training Set",
    x = "log(PSA)", y = "Frequency"
  ) +
  theme_minimal(base_size = 12)

p_dist

Figure 2: Distribution of log(PSA) in the training set.

Important

The distribution is approximately normal, which is suitable for regression analysis. However, the correlation matrix reveals multicollinearity among predictors, particularly between: - lcavol and lcp (r = 0.69) - lcavol and svi (r = 0.59)

This makes regularization methods particularly valuable for this dataset.

OLS (Baseline)

Before applying regularization, let’s fit a standard multiple linear regression model (OLS) as our baseline for comparison.

ols_model <- lm(lpsa ~ ., data = train_data)

ols_pred_train <- predict(ols_model, newdata = train_data)
ols_pred_test <- predict(ols_model, newdata = test_data)

ols_train_mse <- mean((y_train - ols_pred_train)^2)
ols_test_mse <- mean((y_test - ols_pred_test)^2)
ols_train_rmse <- sqrt(ols_train_mse)
ols_test_rmse <- sqrt(ols_test_mse)
ols_train_r2 <- summary(ols_model)$r.squared
ols_test_r2 <- 1 - sum((y_test - ols_pred_test)^2) / sum((y_test - mean(y_test))^2)

cat("OLS Train MSE:", round(ols_train_mse, 4), "\n")

OLS Train MSE: 0.4392

cat("OLS Test MSE:", round(ols_test_mse, 4), "\n")

OLS Test MSE: 0.5213

cat("OLS Train RMSE:", round(ols_train_rmse, 4), "\n")

OLS Train RMSE: 0.6627

cat("OLS Test RMSE:", round(ols_test_rmse, 4), "\n")

OLS Test RMSE: 0.722

cat("OLS Train R²:", round(ols_train_r2, 4), "\n")

OLS Train R²: 0.6944

cat("OLS Test R²:", round(ols_test_r2, 4), "\n")

OLS Test R²: 0.5034

print(summary(ols_model))


Call:
lm(formula = lpsa ~ ., data = train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.64870 -0.34147 -0.05424  0.44941  1.48675 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.429170   1.553588   0.276  0.78334    
lcavol       0.576543   0.107438   5.366 1.47e-06 ***
lweight      0.614020   0.223216   2.751  0.00792 ** 
age         -0.019001   0.013612  -1.396  0.16806    
lbph         0.144848   0.070457   2.056  0.04431 *  
svi          0.737209   0.298555   2.469  0.01651 *  
lcp         -0.206324   0.110516  -1.867  0.06697 .  
gleason     -0.029503   0.201136  -0.147  0.88389    
pgg45        0.009465   0.005447   1.738  0.08755 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7123 on 58 degrees of freedom
Multiple R-squared:  0.6944,    Adjusted R-squared:  0.6522 
F-statistic: 16.47 on 8 and 58 DF,  p-value: 2.042e-12

OLS - the baseline

The ordinary least squares model provides an unregularized baseline. With 8 predictors and 67 training observations, OLS may overfit, especially given the multicollinearity we observed.

Ridge regression

Ridge regression applies an L2 penalty to the coefficients, shrinking them toward zero but never exactly to zero. This is particularly effective for handling multicollinearity.

\[\min_{\beta} \sum_{i=1}^{n} (y_i - \beta_0 - \sum_{j=1}^{p} x_{ij}\beta_j)^2 + \lambda \sum_{j=1}^{p} \beta_j^2\]

set.seed(123)
# see: https://glmnet.stanford.edu/articles/glmnet.html
ridge_cv <- cv.glmnet(x_train_scaled, y_train, alpha = 0, nfolds = 10)

# optimal lambda
cat("Optimal lambda (Ridge):", ridge_cv$lambda.min, "\n")

Optimal lambda (Ridge): 0.08788804

# ridge fit
ridge_model <- glmnet(x_train_scaled, y_train,
  alpha = 0,
  lambda = ridge_cv$lambda.min
)

# Predict
ridge_pred_train <- predict(ridge_model, newx = x_train_scaled)
ridge_pred_test <- predict(ridge_model, newx = x_test_scaled)

# Performance metrics
ridge_train_mse <- mean((y_train - ridge_pred_train)^2)
ridge_test_mse <- mean((y_test - ridge_pred_test)^2)
ridge_train_rmse <- sqrt(ridge_train_mse)
ridge_test_rmse <- sqrt(ridge_test_mse)
ridge_train_r2 <- 1 - sum((y_train - ridge_pred_train)^2) / sum((y_train - mean(y_train))^2)
ridge_test_r2 <- 1 - sum((y_test - ridge_pred_test)^2) / sum((y_test - mean(y_test))^2)

cat("Ridge Train MSE:", round(ridge_train_mse, 4), "\n")

Ridge Train MSE: 0.4473

cat("Ridge Test MSE:", round(ridge_test_mse, 4), "\n")

Ridge Test MSE: 0.4944

cat("Ridge Train RMSE:", round(ridge_train_rmse, 4), "\n")

Ridge Train RMSE: 0.6688

cat("Ridge Test RMSE:", round(ridge_test_rmse, 4), "\n")

Ridge Test RMSE: 0.7031

cat("Ridge Train R²:", round(ridge_train_r2, 4), "\n")

Ridge Train R²: 0.6887

cat("Ridge Test R²:", round(ridge_test_r2, 4), "\n")

Ridge Test R²: 0.529

Ridge Visualizations

# Lambda selection
ridge_cv_df <- data.frame(
  lambda = ridge_cv$lambda,
  cvm = ridge_cv$cvm,
  cvlo = ridge_cv$cvlo,
  cvup = ridge_cv$cvup
)

p_ridge_cv <- ggplot(ridge_cv_df, aes(x = log(lambda), y = cvm)) +
  geom_point(color = "red", size = 2) +
  geom_errorbar(aes(ymin = cvlo, ymax = cvup), width = 0.1, alpha = 0.5) +
  geom_vline(xintercept = log(ridge_cv$lambda.min), linetype = "dashed", color = "blue") +
  geom_vline(xintercept = log(ridge_cv$lambda.1se), linetype = "dashed", color = "darkgreen") +
  labs(
    title = "Ridge regression: Cross-validation",
    x = "log(Lambda)",
    y = "Mean-squared error"
  ) +
  annotate("text",
    x = log(ridge_cv$lambda.min), y = max(ridge_cv_df$cvm),
    label = "lambda.min", hjust = -0.1, color = "blue"
  ) +
  theme_minimal(base_size = 12)

# Coefficient paths
ridge_full <- glmnet(x_train_scaled, y_train, alpha = 0)
ridge_coef_df <- as.data.frame(as.matrix(t(ridge_full$beta)))
ridge_coef_df$lambda <- ridge_full$lambda
ridge_coef_long <- pivot_longer(ridge_coef_df,
  cols = -lambda,
  names_to = "variable", values_to = "coefficient"
)

p_ridge_path <- ggplot(ridge_coef_long, aes(x = log(lambda), y = coefficient, color = variable)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = log(ridge_cv$lambda.min), linetype = "dashed", color = "red") +
  labs(
    title = "Ridge regression: coefficient paths",
    x = "log(Lambda)",
    y = "Coefficient value",
    color = "Variable"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "right")

p_ridge_cv + p_ridge_path

Lasso regression

Lasso (Least absolute shrinkage and selection operator) applies an L1 penalty, which can shrink coefficients exactly to zero, performing automatic variable selection. See this notebook for details.

\[\min_{\beta} \sum_{i=1}^{n} (y_i - \beta_0 - \sum_{j=1}^{p} x_{ij}\beta_j)^2 + \lambda \sum_{j=1}^{p} |\beta_j|\]

# fit lasso
set.seed(32)
# see: https://glmnet.stanford.edu/articles/glmnet.html
lasso_cv <- cv.glmnet(x_train_scaled, y_train, alpha = 1, nfolds = 10)


cat("Optimal lambda (Lasso):", lasso_cv$lambda.min, "\n")

Optimal lambda (Lasso): 0.0133582

lasso_model <- glmnet(x_train_scaled, y_train,
  alpha = 1,
  lambda = lasso_cv$lambda.min
)


lasso_pred_train <- predict(lasso_model, newx = x_train_scaled)
lasso_pred_test <- predict(lasso_model, newx = x_test_scaled)


lasso_train_mse <- mean((y_train - lasso_pred_train)^2)
lasso_test_mse <- mean((y_test - lasso_pred_test)^2)
lasso_train_rmse <- sqrt(lasso_train_mse)
lasso_test_rmse <- sqrt(lasso_test_mse)
lasso_train_r2 <- 1 - sum((y_train - lasso_pred_train)^2) / sum((y_train - mean(y_train))^2)
lasso_test_r2 <- 1 - sum((y_test - lasso_pred_test)^2) / sum((y_test - mean(y_test))^2)

cat("Lasso Train MSE:", round(lasso_train_mse, 4), "\n")

Lasso Train MSE: 0.4428

cat("Lasso Test MSE:", round(lasso_test_mse, 4), "\n")

Lasso Test MSE: 0.4933

cat("Lasso Train RMSE:", round(lasso_train_rmse, 4), "\n")

Lasso Train RMSE: 0.6654

cat("Lasso Test RMSE:", round(lasso_test_rmse, 4), "\n")

Lasso Test RMSE: 0.7024

cat("Lasso Train R²:", round(lasso_train_r2, 4), "\n")

Lasso Train R²: 0.6919

cat("Lasso Test R²:", round(lasso_test_r2, 4), "\n")

Lasso Test R²: 0.53

# Selected variables
lasso_coefs <- coef(lasso_model)
selected_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0]
cat("\nSelected variables by Lasso:", paste(selected_vars[-1], collapse = ", "), "\n")


Selected variables by Lasso: lcavol, lweight, age, lbph, svi, lcp, pgg45

Lasso - visualise

# Lambda selection
lasso_cv_df <- data.frame(
  lambda = lasso_cv$lambda,
  cvm = lasso_cv$cvm,
  cvlo = lasso_cv$cvlo,
  cvup = lasso_cv$cvup
)

p_lasso_cv <- ggplot(lasso_cv_df, aes(x = log(lambda), y = cvm)) +
  geom_point(color = "red", size = 2) +
  geom_errorbar(aes(ymin = cvlo, ymax = cvup), width = 0.1, alpha = 0.5) +
  geom_vline(xintercept = log(lasso_cv$lambda.min), linetype = "dashed", color = "blue") +
  geom_vline(xintercept = log(lasso_cv$lambda.1se), linetype = "dashed", color = "darkgreen") +
  labs(
    title = "Lasso Regression: Cross-Validation",
    x = "log(Lambda)",
    y = "Mean-Squared Error"
  ) +
  annotate("text",
    x = log(lasso_cv$lambda.min), y = max(lasso_cv_df$cvm),
    label = "lambda.min", hjust = -0.1, color = "blue"
  ) +
  theme_minimal(base_size = 12)

# Coefficient paths (shows variable selection)
lasso_full <- glmnet(x_train_scaled, y_train, alpha = 1)
lasso_coef_df <- as.data.frame(as.matrix(t(lasso_full$beta)))
lasso_coef_df$lambda <- lasso_full$lambda
lasso_coef_long <- pivot_longer(lasso_coef_df,
  cols = -lambda,
  names_to = "variable", values_to = "coefficient"
)

p_lasso_path <- ggplot(lasso_coef_long, aes(x = log(lambda), y = coefficient, color = variable)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = log(lasso_cv$lambda.min), linetype = "dashed", color = "red") +
  labs(
    title = "Lasso Regression: Coefficient Paths (Variable Selection)",
    x = "log(Lambda)",
    y = "Coefficient Value",
    color = "Variable"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "right")

p_lasso_cv + p_lasso_path

Figure 3: Lasso regression cross-validation and coefficient paths. Note how some coefficients are driven exactly to zero, performing automatic variable selection.

Principal Component Regression (PCR)

PCR performs unsupervised dimension reduction by finding components that maximize variance in the predictors (ignoring the response).

train_df <- data.frame(x_train_scaled, lpsa = y_train)
test_df <- data.frame(x_test_scaled, lpsa = y_test)

cat("=== PRINCIPAL COMPONENT REGRESSION ===\n")

=== PRINCIPAL COMPONENT REGRESSION ===

# Fit PCR with cross-validation
set.seed(123)
pcr_model <- pcr(lpsa ~ ., data = train_df, validation = "CV", segments = 10)

# Determine optimal number of components
pcr_cv_mse <- RMSEP(pcr_model, estimate = "CV")
optimal_ncomp_pcr <- which.min(pcr_cv_mse$val[1, , ]) - 1
cat("Optimal number of components (PCR):", optimal_ncomp_pcr, "\n")

Optimal number of components (PCR): 8

# Predictions
pcr_pred_train <- predict(pcr_model, newdata = train_df, ncomp = optimal_ncomp_pcr)
pcr_pred_test <- predict(pcr_model, newdata = test_df, ncomp = optimal_ncomp_pcr)

# Performance metrics
pcr_train_mse <- mean((y_train - pcr_pred_train)^2)
pcr_test_mse <- mean((y_test - pcr_pred_test)^2)
pcr_train_rmse <- sqrt(pcr_train_mse)
pcr_test_rmse <- sqrt(pcr_test_mse)
pcr_train_r2 <- 1 - sum((y_train - pcr_pred_train)^2) / sum((y_train - mean(y_train))^2)
pcr_test_r2 <- 1 - sum((y_test - pcr_pred_test)^2) / sum((y_test - mean(y_test))^2)

cat("PCR Train MSE:", round(pcr_train_mse, 4), "\n")

PCR Train MSE: 0.4392

cat("PCR Test MSE:", round(pcr_test_mse, 4), "\n")

PCR Test MSE: 0.5213

cat("PCR Train RMSE:", round(pcr_train_rmse, 4), "\n")

PCR Train RMSE: 0.6627

cat("PCR Test RMSE:", round(pcr_test_rmse, 4), "\n")

PCR Test RMSE: 0.722

cat("PCR Train R²:", round(pcr_train_r2, 4), "\n")

PCR Train R²: 0.6944

cat("PCR Test R²:", round(pcr_test_r2, 4), "\n")

PCR Test R²: 0.5034

PCR visualisation

pcr_rmsep_df <- data.frame(
  ncomp = 0:(ncol(x_train)),
  RMSEP = as.vector(pcr_cv_mse$val[1, , ])
)

p_pcr_cv <- ggplot(pcr_rmsep_df, aes(x = ncomp, y = RMSEP)) +
  geom_line(color = "darkgreen", linewidth = 1) +
  geom_point(color = "darkgreen", size = 2) +
  geom_vline(xintercept = optimal_ncomp_pcr, linetype = "dashed", color = "red") +
  labs(
    title = "PCR: cross-validation MSEP",
    x = "Number of components",
    y = "Root Mean Squared Error of Prediction (MSEP)"
  ) +
  annotate("text",
    x = optimal_ncomp_pcr, y = max(pcr_rmsep_df$RMSEP),
    label = paste("Optimal:", optimal_ncomp_pcr), hjust = -0.1, color = "red"
  ) +
  theme_minimal(base_size = 12)

# Explained variance
explvar_pcr <- pls::explvar(pcr_model)
explvar_df <- data.frame(
  Component = 1:length(explvar_pcr),
  Variance_Explained = explvar_pcr
)

p_pcr_var <- ggplot(explvar_df, aes(x = Component, y = Variance_Explained)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  labs(
    title = "PCR: explained variance by PCs",
    x = "PC",
    y = "% variance explained"
  ) +
  scale_x_continuous(breaks = 1:nrow(explvar_df)) +
  theme_minimal(base_size = 12)

p_pcr_cv + p_pcr_var

Figure 4: PCR: component selection and variance explained by each principal component.

Elastic Net Regression

Elastic Net combines both L1 and L2 penalties, providing a balance between Ridge and Lasso. The mixing parameter \(\alpha\) controls the balance (here \(\alpha = 0.5\)).

\[\min_{\beta} \sum_{i=1}^{n} (y_i - \beta_0 - \sum_{j=1}^{p} x_{ij}\beta_j)^2 + \lambda \left[\alpha \sum_{j=1}^{p} |\beta_j| + (1-\alpha) \sum_{j=1}^{p} \beta_j^2\right]\]

cat("=== ELASTIC NET REGRESSION ===\n")

=== ELASTIC NET REGRESSION ===

set.seed(123)
# Fit Elastic Net (alpha = 0.5 for equal mix of L1 and L2)

enet_cv <- cv.glmnet(x_train_scaled, y_train, alpha = 0.5, nfolds = 10)

# Optimal lambda
cat("Optimal lambda (Elastic Net):", enet_cv$lambda.min, "\n")

Optimal lambda (Elastic Net): 0.02020998

# Fit final model
enet_model <- glmnet(x_train_scaled, y_train,
  alpha = 0.5,
  lambda = enet_cv$lambda.min
)

# Predictions
enet_pred_train <- predict(enet_model, newx = x_train_scaled)
enet_pred_test <- predict(enet_model, newx = x_test_scaled)

# Performance metrics
enet_train_mse <- mean((y_train - enet_pred_train)^2)
enet_test_mse <- mean((y_test - enet_pred_test)^2)
enet_train_rmse <- sqrt(enet_train_mse)
enet_test_rmse <- sqrt(enet_test_mse)
enet_train_r2 <- 1 - sum((y_train - enet_pred_train)^2) / sum((y_train - mean(y_train))^2)
enet_test_r2 <- 1 - sum((y_test - enet_pred_test)^2) / sum((y_test - mean(y_test))^2)

cat("Elastic Net Train MSE:", round(enet_train_mse, 4), "\n")

Elastic Net Train MSE: 0.4423

cat("Elastic Net Test MSE:", round(enet_test_mse, 4), "\n")

Elastic Net Test MSE: 0.496

cat("Elastic Net Train RMSE:", round(enet_train_rmse, 4), "\n")

Elastic Net Train RMSE: 0.6651

cat("Elastic Net Test RMSE:", round(enet_test_rmse, 4), "\n")

Elastic Net Test RMSE: 0.7042

cat("Elastic Net Train R²:", round(enet_train_r2, 4), "\n")

Elastic Net Train R²: 0.6922

cat("Elastic Net Test R²:", round(enet_test_r2, 4), "\n")

Elastic Net Test R²: 0.5275

Elastic net - visualise

# Lambda selection
enet_cv_df <- data.frame(
  lambda = enet_cv$lambda,
  cvm = enet_cv$cvm,
  cvlo = enet_cv$cvlo,
  cvup = enet_cv$cvup
)

p_enet_cv <- ggplot(enet_cv_df, aes(x = log(lambda), y = cvm)) +
  geom_point(color = "red", size = 2) +
  geom_errorbar(aes(ymin = cvlo, ymax = cvup), width = 0.1, alpha = 0.5) +
  geom_vline(xintercept = log(enet_cv$lambda.min), linetype = "dashed", color = "blue") +
  geom_vline(xintercept = log(enet_cv$lambda.1se), linetype = "dashed", color = "darkgreen") +
  labs(
    title = "Elastic net: CV (alpha = 0.5)",
    x = "log(Lambda)",
    y = "Mean-squared error"
  ) +
  annotate("text",
    x = log(enet_cv$lambda.min), y = max(enet_cv_df$cvm),
    label = "lambda.min", hjust = -0.1, color = "blue"
  ) +
  theme_minimal(base_size = 12)

enet_full <- glmnet(x_train_scaled, y_train, alpha = 0.5)
enet_coef_df <- as.data.frame(as.matrix(t(enet_full$beta)))
enet_coef_df$lambda <- enet_full$lambda
enet_coef_long <- pivot_longer(enet_coef_df,
  cols = -lambda,
  names_to = "variable", values_to = "coefficient"
)

p_enet_path <- ggplot(enet_coef_long, aes(x = log(lambda), y = coefficient, color = variable)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = log(enet_cv$lambda.min), linetype = "dashed", color = "red") +
  labs(
    title = "Elastic net: Coefficient paths",
    x = "log(Lambda)",
    y = "Coefficient value",
    color = "Variable"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "right")

p_enet_cv + p_enet_path

Figure 5: Elastic Net regression (α=0.5) combines ridge and lasso properties, providing both regularization and variable selection.

Model comparison

Performance metrics

comparison <- data.frame(
  Method = c("OLS", "Ridge", "Lasso", "Elastic Net",  "PCR"),
  Train_MSE = c(
    ols_train_mse, ridge_train_mse, lasso_train_mse, enet_train_mse,
    pcr_train_mse
  ),
  Test_MSE = c(
    ols_test_mse, ridge_test_mse, lasso_test_mse, enet_test_mse,
    pcr_test_mse
  ),
  Train_RMSE = c(
    ols_train_rmse, ridge_train_rmse, lasso_train_rmse, enet_train_rmse,
    pcr_train_rmse
  ),
  Test_RMSE = c(
    ols_test_rmse, ridge_test_rmse, lasso_test_rmse, enet_test_rmse,
    pcr_test_rmse
  ),
  Train_R2 = c(
    ols_train_r2, ridge_train_r2, lasso_train_r2, enet_train_r2,
    pcr_train_r2
  ),
  Test_R2 = c(
    ols_test_r2, ridge_test_r2, lasso_test_r2, enet_test_r2,
    pcr_test_r2
  )
)

comparison %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  knitr::kable(col.names = c("Method", "Train MSE", "Test MSE", "Train RMSE", "Test RMSE", "Train R²", "Test R²"))

Table 1: Performance comparison of all methods including OLS baseline

Method	Train MSE	Test MSE	Train RMSE	Test RMSE	Train R²	Test R²
OLS	0.4392	0.5213	0.6627	0.7220	0.6944	0.5034
Ridge	0.4473	0.4944	0.6688	0.7031	0.6887	0.5290
Lasso	0.4428	0.4933	0.6654	0.7024	0.6919	0.5300
Elastic Net	0.4423	0.4960	0.6651	0.7042	0.6922	0.5275
PCR	0.4392	0.5213	0.6627	0.7220	0.6944	0.5034

RMSE comparison

rmse_comparison <- data.frame(
  Method = rep(c("OLS", "Ridge", "Lasso", "Elastic Net", "PCR"), 2),
  RMSE = c(comparison$Train_RMSE, comparison$Test_RMSE),
  Dataset = rep(c("Training", "Test"), each = 5)
)

rmse_comparison$Method <- factor(rmse_comparison$Method,
  levels = c("OLS", "Ridge", "Lasso", "Elastic Net",  "PCR")
)

p_rmse <- ggplot(rmse_comparison, aes(x = Method, y = RMSE, fill = Dataset)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = sprintf("%.3f", RMSE)),
    position = position_dodge(width = 0.7),
    vjust = -0.5, size = 3.5, fontface = "bold"
  ) +
  scale_fill_manual(
    values = c("Training" = "#4472C4", "Test" = "#ED7D31"),
    name = "Dataset"
  ) +
  labs(
    title = "RMSE comparison",
    subtitle = "OLS (baseline) vs regularized regression methods",
    x = "Method",
    y = "RMSE (Root mean squared error)",
    caption = "Lower RMSE = Better performance | Test RMSE most important for generalization"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, face = "bold"),
    legend.position = "top",
    legend.title = element_text(face = "bold", size = 12),
    legend.text = element_text(size = 11),
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11, color = "gray30"),
    plot.caption = element_text(size = 9, color = "gray50", hjust = 0.5),
    panel.grid.major.x = element_blank()
  ) +
  ylim(0, max(rmse_comparison$RMSE) * 1.15)

p_rmse

cat("\n=== RMSE Summary ===\n")


=== RMSE Summary ===

cat(
  "Best training RMSE:", comparison$Method[which.min(comparison$Train_RMSE)],
  "=", round(min(comparison$Train_RMSE), 4), "\n"
)

Best training RMSE: OLS = 0.6627

cat(
  "Best test RMSE:", comparison$Method[which.min(comparison$Test_RMSE)],
  "=", round(min(comparison$Test_RMSE), 4), "\n"
)

Best test RMSE: Lasso = 0.7024

cat("\nTest RMSE improvement over OLS:\n")


Test RMSE improvement over OLS:

for (i in 2:nrow(comparison)) {
  improvement <- ((ols_test_rmse - comparison$Test_RMSE[i]) / ols_test_rmse) * 100
  cat(sprintf("  %s: %.2f%% improvement\n", comparison$Method[i], improvement))
}

  Ridge: 2.61% improvement
  Lasso: 2.72% improvement
  Elastic Net: 2.46% improvement
  PCR: 0.00% improvement

Performance vis

comparison$Method <- factor(comparison$Method, levels = comparison$Method)

# Test MSE comparison
p_mse <- ggplot(comparison, aes(x = Method, y = Test_MSE, fill = Method)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = sprintf("%.4f", Test_MSE)), vjust = -0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Model comparison: Test set MSE",
    x = "Method",
    y = "MSE"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Test R² comparison
p_r2 <- ggplot(comparison, aes(x = Method, y = Test_R2, fill = Method)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = sprintf("%.4f", Test_R2)), vjust = -0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Model Comparison: Test set R²",
    x = "Method",
    y = "R² (Coefficient of determination)"
  ) +
  ylim(0, max(comparison$Test_R2) * 1.15) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p_mse + p_r2

Figure 7: Model comparison: Test set MSE and R² for all methods.

Predicted vs actual

pred_actual_df <- data.frame(
  Actual = rep(y_test, 4),
  Predicted = c(
    ridge_pred_test, lasso_pred_test, enet_pred_test,
    pcr_pred_test
  ),
  Method = rep(c("Ridge", "Lasso", "Elastic Net", "PCR"), each = length(y_test)),
  R2 = rep(c(ridge_test_r2, lasso_test_r2, enet_test_r2,  pcr_test_r2),
    each = length(y_test)
  )
)

ggplot(pred_actual_df, aes(x = Actual, y = Predicted)) +
  geom_point(aes(color = Method), size = 2, alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_text(
    data = pred_actual_df %>%
      group_by(Method, R2) %>%
      summarise(x = min(Actual), y = max(Predicted), .groups = "drop"),
    aes(x = x, y = y, label = sprintf("R² = %.3f", R2)),
    hjust = 0, vjust = 1, size = 3
  ) +
  facet_wrap(~Method, ncol = 3) +
  labs(
    title = "Predicted vs actual log(PSA) for all methods",
    x = "Actual log(PSA)",
    y = "Predicted log(PSA)"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

Coefficient comparison

ridge_coef <- as.vector(coef(ridge_model))[-1]
lasso_coef <- as.vector(coef(lasso_model))[-1]
enet_coef <- as.vector(coef(enet_model))[-1]

coef_comparison <- data.frame(
  Variable = colnames(x_train),
  Ridge = ridge_coef,
  Lasso = lasso_coef,
  ElasticNet = enet_coef
)

coef_long <- pivot_longer(coef_comparison,
  cols = -Variable,
  names_to = "Method", values_to = "Coefficient"
)

ggplot(coef_long, aes(x = Variable, y = Coefficient, fill = Method)) +
  geom_col(position = "dodge", alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Coefficient comparison across methods",
    x = "Variable",
    y = "Coefficient value"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

Figure 9: Coefficient values across ridge, lasso, and elastic net. Note how Lasso sets some coefficients to exactly zero.

Session info

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.1

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

time zone: Asia/Kolkata
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.4     tidyr_1.3.1     reshape2_1.4.4  patchwork_1.3.2
[5] ggplot2_4.0.0   MASS_7.3-65     pls_2.8-5       glmnet_4.1-10  
[9] Matrix_1.7-4   

loaded via a namespace (and not attached):
 [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.2     tidyselect_1.2.1  
 [5] Rcpp_1.1.0         stringr_1.6.0      dichromat_2.0-0.1  splines_4.5.2     
 [9] scales_1.4.0       yaml_2.3.10        fastmap_1.2.0      lattice_0.22-7    
[13] plyr_1.8.9         R6_2.6.1           labeling_0.4.3     generics_0.1.4    
[17] shape_1.4.6.1      knitr_1.50         iterators_1.0.14   htmlwidgets_1.6.4 
[21] tibble_3.3.0       pillar_1.11.1      RColorBrewer_1.1-3 rlang_1.1.6       
[25] stringi_1.8.7      xfun_0.54          S7_0.2.0           cli_3.6.5         
[29] withr_3.0.2        magrittr_2.0.4     digest_0.6.37      foreach_1.5.2     
[33] grid_4.5.2         lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.5    
[37] glue_1.8.0         farver_2.1.2       codetools_0.2-20   survival_3.8-3    
[41] purrr_1.2.0        rmarkdown_2.30     tools_4.5.2        pkgconfig_2.0.3   
[45] htmltools_0.5.8.1