Lecture 9: PCA Step by Step - Visual Understanding

Author

DH302

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)

set.seed(2)  
# Generate synthetic data with correlation
x <- 1:100
ex <- rnorm(100, 0, 25)  # gaussian blob for x
ey <- rnorm(100, 0, 25)  # gaussian blob for y
y <- 25 + 2 * x          # y is linearly related to x
x_obs <- x + ex          # add noise to x
y_obs <- y + ey          # add noise to y

# Create data frame for plotting
data_original <- data.frame(
  x = x_obs,
  y = y_obs,
  type = "Original Data"
)

Step 1: Original Data with Center Point

Let’s first look at our raw data and identify the center (mean) point:

# Calculate center point
center_x <- mean(x_obs)
center_y <- mean(y_obs)

p1 <- ggplot(data_original, aes(x = x, y = y)) +
  geom_point(alpha = 0.7, size = 2, color = "black") +
  geom_point(aes(x = center_x, y = center_y), 
             color = "red", size = 4, shape = 19) +
  labs(title = "Step 1: Original Data with Center Point",
       subtitle = paste0("Center: (", round(center_x, 1), ", ", round(center_y, 1), ")"),
       x = "X values", y = "Y values") +
  theme_minimal() +
  coord_fixed() +
  theme(plot.title = element_text(face = "bold"))

p1
Warning in geom_point(aes(x = center_x, y = center_y), color = "red", size = 4, : All aesthetics have length 1, but the data has 100 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

Step 2: Center the Data

PCA requires centered data (mean = 0). Let’s subtract the mean from each variable:

# Center the data
M <- cbind(x_obs - center_x, y_obs - center_y)
colnames(M) <- c("x_centered", "y_centered")

data_centered <- data.frame(
  x = M[, 1],
  y = M[, 2],
  type = "Centered Data"
)

p2 <- ggplot(data_centered, aes(x = x, y = y)) +
  geom_point(alpha = 0.7, size = 2, color = "blue") +
  geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  labs(title = "Step 2: Centered Data",
       subtitle = "Data now centered at origin (0,0)",
       x = "X centered", y = "Y centered") +
  theme_minimal() +
  coord_fixed() +
  theme(plot.title = element_text(face = "bold"))

p2
Warning in geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19): All aesthetics have length 1, but the data has 100 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

Step 3: Calculate Covariance Matrix and Eigenanalysis

Now we compute the covariance matrix and find its eigenvalues and eigenvectors:

# Calculate covariance matrix
MCov <- cov(M)
print("Covariance Matrix:")
[1] "Covariance Matrix:"
print(round(MCov, 2))
           x_centered y_centered
x_centered    1462.16    1271.10
y_centered    1271.10    3366.19
# Compute eigenvalues and eigenvectors
eigen_result <- eigen(MCov)
eigenValues <- eigen_result$values
eigenVectors <- eigen_result$vectors

# Alternatively we can calcucate svd of the original central matrix
d <- svd(M)$d   # singular values
v <- svd(M)$v   # right singular values


# Calculate proportion of variance explained
var_explained <- eigenValues / sum(eigenValues)
print("Proportion of variance explained by each PC:")
[1] "Proportion of variance explained by each PC:"
print(paste("PC1:", round(var_explained[1] * 100, 1), "%"))
[1] "PC1: 82.9 %"
print(paste("PC2:", round(var_explained[2] * 100, 1), "%"))
[1] "PC2: 17.1 %"
# Show that eigeValues of MCov and singular vlaues of M are same

print(sum(eigenValues-d))
[1] 3912.914
# Show that eigenVectors of MCov and right singular vlaues of M are same

print(sum(eigenVectors-v))
[1] 1.110223e-16

Step 4: Visualize Principal Component Directions

Let’s add the principal component axes to our centered data:

# Create lines for principal components
# PC1 direction (first eigenvector)
pc1_slope <- eigenVectors[2, 1] / eigenVectors[1, 1]
# PC2 direction (second eigenvector) 
pc2_slope <- eigenVectors[2, 2] / eigenVectors[1, 2]

# Create line segments for visualization
line_length <- 50  # Extend lines in both directions

# PC1 line points
pc1_x <- c(-line_length, line_length)
pc1_y <- pc1_slope * pc1_x

# PC2 line points  
pc2_x <- c(-line_length, line_length)
pc2_y <- pc2_slope * pc2_x

# Plot with principal component axes
p3 <- ggplot(data_centered, aes(x = x, y = y)) +
  geom_point(alpha = 0.7, size = 2, color = "blue") +
  geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19) +
  geom_line(data = data.frame(x = pc1_x, y = pc1_y), 
            aes(x = x, y = y), color = "green", size = 1.2, alpha = 0.8) +
  geom_line(data = data.frame(x = pc2_x, y = pc2_y), 
            aes(x = x, y = y), color = "orange", size = 1.2, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.3) +
  labs(title = "Step 4: Principal Component Axes",
       subtitle = "Green = PC1 (max variance), Orange = PC2 (orthogonal)",
       x = "X centered", y = "Y centered") +
  theme_minimal() +
  coord_fixed(xlim = c(-80, 80), ylim = c(-80, 80)) +
  theme(plot.title = element_text(face = "bold"))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
p3
Warning in geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19): All aesthetics have length 1, but the data has 100 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

print(paste("PC1 explains", round(var_explained[1] * 100, 1), "% of variance"))
[1] "PC1 explains 82.9 % of variance"
print(paste("PC2 explains", round(var_explained[2] * 100, 1), "% of variance"))
[1] "PC2 explains 17.1 % of variance"

Step 5: Project Data onto First Principal Component

# Project data onto first principal component
# Projection formula: (M · v1) * v1
pc1_vector <- eigenVectors[, 1]
projections_pc1 <- (M %*% pc1_vector) %*% t(pc1_vector)

# Create data frame with original points and projections
data_with_projections <- data.frame(
  x_orig = M[, 1],
  y_orig = M[, 2],
  x_proj = projections_pc1[, 1],
  y_proj = projections_pc1[, 2]
)

# Plot with projections
p4 <- ggplot() +
  # Original points
  geom_point(data = data_with_projections, 
             aes(x = x_orig, y = y_orig), 
             alpha = 0.7, size = 2, color = "blue") +
  # Projected points
  geom_point(data = data_with_projections, 
             aes(x = x_proj, y = y_proj), 
             color = "purple", size = 2, alpha = 0.8) +
  # Connection lines
  geom_segment(data = data_with_projections,
               aes(x = x_orig, y = y_orig, xend = x_proj, yend = y_proj),
               color = "purple", alpha = 0.5, linetype = "dashed") +
  # Principal component axes
  geom_line(data = data.frame(x = pc1_x, y = pc1_y), 
            aes(x = x, y = y), color = "green", size = 1.2, alpha = 0.8) +
  geom_line(data = data.frame(x = pc2_x, y = pc2_y), 
            aes(x = x, y = y), color = "orange", size = 1.2, alpha = 0.8) +
  geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19) +
  labs(title = "Step 5: Projections onto First Principal Component",
       subtitle = "Purple dots = projections, dashed lines = distance to PC1",
       x = "X centered", y = "Y centered") +
  theme_minimal() +
  coord_fixed(xlim = c(-80, 80), ylim = c(-80, 80)) +
  theme(plot.title = element_text(face = "bold"))

print(p4)

Step 6: Transform to Principal Component Coordinates

Finally, let’s transform our data to the principal component coordinate system:

# Transform data to PC coordinates
# PC scores = M * eigenvectors
pc_scores <- M %*% eigenVectors
colnames(pc_scores) <- c("PC1", "PC2")

# Create data frame for PC space
data_pc_space <- data.frame(
  PC1 = pc_scores[, 1],
  PC2 = pc_scores[, 2]
)

# Plot in PC coordinate system
p5 <- ggplot(data_pc_space, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.7, size = 2, color = "darkgreen") +
  geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  labs(title = "Step 6: Data in Principal Component Space",
       subtitle = paste0("PC1 (", round(var_explained[1]*100, 1), "% var) vs PC2 (", round(var_explained[2]*100, 1), "% var)"),
       x = paste0("PC1 (", round(var_explained[1]*100, 1), "% of variance)"), 
       y = paste0("PC2 (", round(var_explained[2]*100, 1), "% of variance)")) +
  theme_minimal() +
  coord_fixed() +
  theme(plot.title = element_text(face = "bold"))

print(p5)
Warning in geom_point(aes(x = 0, y = 0), color = "red", size = 4, shape = 19): All aesthetics have length 1, but the data has 100 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

print("Standard deviations of principal components:")
[1] "Standard deviations of principal components:"
print(paste("PC1:", round(sqrt(eigenValues[1]), 2)))
[1] "PC1: 63.26"
print(paste("PC2:", round(sqrt(eigenValues[2]), 2)))
[1] "PC2: 28.74"

Step 7: Comparison - Original vs Principal Component Space

Let’s compare the original data with the transformed data side by side:

# Side by side comparison
p_original <- ggplot(data_original, aes(x = x, y = y)) +
  geom_point(alpha = 0.7, size = 2, color = "black") +
  labs(title = "Original Data Space",
       x = "X", y = "Y") +
  theme_minimal() +
  coord_fixed()

p_transformed <- ggplot(data_pc_space, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.7, size = 2, color = "darkgreen") +
  labs(title = "Principal Component Space",
       x = paste0("PC1 (", round(var_explained[1]*100, 1), "%)"), 
       y = paste0("PC2 (", round(var_explained[2]*100, 1), "%)")) +
  theme_minimal() +
  coord_fixed()

combined_plot <- p_original + p_transformed
print(combined_plot)

Verification with R’s prcomp()

Let’s verify our calculations using R’s built-in PCA function:

# Use prcomp for verification
pca_result <- prcomp(cbind(x_obs, y_obs), center = TRUE, scale = FALSE)

print("Our eigenvalues vs prcomp standard deviations squared:")
[1] "Our eigenvalues vs prcomp standard deviations squared:"
print(data.frame(
  Our_eigenvalues = round(eigenValues, 4),
  prcomp_sdev_squared = round(pca_result$sdev^2, 4),
  Difference = round(abs(eigenValues - pca_result$sdev^2), 6)
))
  Our_eigenvalues prcomp_sdev_squared Difference
1       4002.2695           4002.2695          0
2        826.0842            826.0842          0
print("Our PC scores vs prcomp scores (first 5 rows):")
[1] "Our PC scores vs prcomp scores (first 5 rows):"
our_scores <- pc_scores[1:5, ]
prcomp_scores <- pca_result$x[1:5, ]
print("Our scores:")
[1] "Our scores:"
print(round(our_scores, 4))
           PC1      PC2
[1,]  -97.0078  31.0233
[2,] -100.8648  -2.2662
[3,]  -95.7847 -40.0637
[4,] -133.6927  15.8376
[5,] -122.2248  -8.8986
print("prcomp scores:")  
[1] "prcomp scores:"
print(round(prcomp_scores, 4))
           PC1      PC2
[1,]  -97.0078  31.0233
[2,] -100.8648  -2.2662
[3,]  -95.7847 -40.0637
[4,] -133.6927  15.8376
[5,] -122.2248  -8.8986