NFHS5 PCA

suppressPackageStartupMessages({
  library(tidyverse)
  library(ggpubr)
  library(factoextra)
  library(kableExtra)
  library(ggpubr)
  library(ggrepel)
  library(viridis)
})

theme_set(theme_pubr())

Read data

# data credits:https://github.com/pratapvardhan/NFHS-5/
indicators <- read_csv("https://raw.githubusercontent.com/pratapvardhan/NFHS-5/refs/heads/master/NFHS-5-States.csv") %>%
  pull(indicator) %>%
  unique() %>%
  as.character() %>%
  sort()
Rows: 4847 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): state, state_code, indicator
dbl (4): nfhs5_urban, nfhs5_rural, nfhs5_total, nfhs4_total

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
indicators.df <- data.frame(indicator = indicators, stringsAsFactors = F)
indicators.df$indicator_code <- paste0("ind", stringr::str_split_fixed(indicators.df$indicator, pattern = "\\.", n = 2)[, 1])
DT::datatable(indicators.df)

Preprocess dataframe

df <- read_csv("https://raw.githubusercontent.com/pratapvardhan/NFHS-5/refs/heads/master/NFHS-5-States.csv")
Rows: 4847 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): state, state_code, indicator
dbl (4): nfhs5_urban, nfhs5_rural, nfhs5_total, nfhs4_total

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df$nfhs5_total <- as.numeric(df$nfhs5_total)
df <- left_join(df, indicators.df)
Joining with `by = join_by(indicator)`
df.total <- df %>%
  select(state, indicator_code, nfhs5_total) %>%
  drop_na()
head(df.total)
# A tibble: 6 × 3
  state indicator_code nfhs5_total
  <chr> <chr>                <dbl>
1 India ind1                  71.8
2 India ind2                  26.5
3 India ind3                1020  
4 India ind4                 929  
5 India ind5                  89.1
6 India ind6                  70.8
df.total.long <- df.total %>%
  pivot_wider(names_from = indicator_code, values_from = nfhs5_total) %>%
  as.data.frame()
rownames(df.total.long) <- df.total.long$state

na_cols <- df.total.long %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  as.numeric()
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:

# Simple named list: list(mean = mean, median = median)

# Auto named with `tibble::lst()`: tibble::lst(mean, median)

# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
df.total.long.noindia <- df.total.long %>% filter(state != "India")

df.total.long$state <- NULL
df.total.long.noindia$state <- NULL

na_cols <- df.total.long.noindia %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  as.numeric()
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:

# Simple named list: list(mean = mean, median = median)

# Auto named with `tibble::lst()`: tibble::lst(mean, median)

# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
df.total.long.noindia.nona <- df.total.long.noindia[, na_cols == 0]
DT::datatable(df.total.long.noindia.nona)

Perform PCA

pca <- prcomp(df.total.long.noindia.nona,
  scale = TRUE
)
summary(pca)
Importance of components:
                          PC1    PC2     PC3     PC4     PC5     PC6     PC7
Standard deviation     5.3462 4.3451 3.30448 2.76707 2.63619 2.16158 2.05119
Proportion of Variance 0.2485 0.1642 0.09495 0.06658 0.06043 0.04063 0.03659
Cumulative Proportion  0.2485 0.4127 0.50766 0.57424 0.63467 0.67530 0.71189
                           PC8    PC9    PC10   PC11    PC12    PC13    PC14
Standard deviation     1.97192 1.8481 1.73946 1.6085 1.54106 1.43664 1.37908
Proportion of Variance 0.03381 0.0297 0.02631 0.0225 0.02065 0.01795 0.01654
Cumulative Proportion  0.74570 0.7754 0.80171 0.8242 0.84486 0.86281 0.87935
                          PC15    PC16    PC17    PC18    PC19    PC20    PC21
Standard deviation     1.31581 1.28831 1.21075 1.12196 0.95072 0.93385 0.87917
Proportion of Variance 0.01506 0.01443 0.01275 0.01095 0.00786 0.00758 0.00672
Cumulative Proportion  0.89440 0.90883 0.92158 0.93253 0.94039 0.94797 0.95469
                          PC22    PC23    PC24    PC25    PC26    PC27   PC28
Standard deviation     0.85576 0.80438 0.76163 0.74050 0.71980 0.60236 0.5776
Proportion of Variance 0.00637 0.00563 0.00504 0.00477 0.00451 0.00316 0.0029
Cumulative Proportion  0.96106 0.96668 0.97173 0.97650 0.98100 0.98416 0.9871
                         PC29    PC30   PC31    PC32    PC33    PC34    PC35
Standard deviation     0.5575 0.52707 0.4913 0.44565 0.43720 0.40020 0.32928
Proportion of Variance 0.0027 0.00242 0.0021 0.00173 0.00166 0.00139 0.00094
Cumulative Proportion  0.9898 0.99218 0.9943 0.99600 0.99766 0.99906 1.00000
                            PC36
Standard deviation     1.146e-14
Proportion of Variance 0.000e+00
Cumulative Proportion  1.000e+00

How much variance is explained

plot(pca, type = "l")

percent.var.explained <- 100 * pca$sdev^2 / sum(pca$sdev^2)
barplot(percent.var.explained)

Plot PCA

df <- df.total.long.noindia.nona
pca <- prcomp(df, scale = TRUE)
df.pca <- as.data.frame(pca$x)
df.pca$state <- rownames(df.total.long.noindia.nona)
df.pca <- df.pca %>% select(state, PC1, PC2)
ggplot(df.pca, mapping = aes(x = PC1, y = PC2)) +
  geom_point() +
  geom_text_repel(aes(label = state), hjust = 0, vjust = 0) +
  xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
  ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))

Add metadata to states

states <- c(
  "Andaman & Nicobar Islands", "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar",
  "Chandigarh", "Chhattisgarh", "Dadra & Nagar Haveli and Daman & Diu", "NCT Delhi",
  "Goa", "Gujarat", "Himachal Pradesh", "Haryana", "Jharkhand", "Jammu & Kashmir",
  "Karnataka", "Kerala", "Lakshadweep", "Ladakh", "Maharashtra", "Meghalaya",
  "Manipur", "Madhya Pradesh", "Mizoram", "Nagaland", "Odisha", "Punjab", "Puducherry",
  "Rajasthan", "Sikkim", "Telangana", "Tamil Nadu", "Tripura", "Uttar Pradesh",
  "Uttarakhand", "West Bengal"
)

# mapping of states to regions
region_mapping <- list(
  "North" = c(
    "Chandigarh", "NCT Delhi", "Haryana", "Himachal Pradesh", "Jammu & Kashmir",
    "Ladakh", "Punjab", "Rajasthan", "Uttarakhand", "Uttar Pradesh"
  ),
  "South" = c("Andhra Pradesh", "Karnataka", "Kerala", "Puducherry", "Tamil Nadu", "Telangana"),
  "East" = c("Bihar", "Jharkhand", "Odisha", "West Bengal"),
  "West" = c("Goa", "Gujarat", "Maharashtra", "Rajasthan", "Dadra & Nagar Haveli and Daman & Diu"),
  "Northeast" = c(
    "Arunachal Pradesh", "Assam", "Manipur", "Meghalaya", "Mizoram",
    "Nagaland", "Sikkim", "Tripura"
  ),
  "Central" = c("Chhattisgarh", "Madhya Pradesh"),
  "Islands" = c("Andaman & Nicobar Islands", "Lakshadweep")
)

state_to_region <- unlist(lapply(names(region_mapping), function(region) {
  setNames(rep(region, length(region_mapping[[region]])), region_mapping[[region]])
}))

# Create a data frame
df.mapper <- data.frame(
  state = states,
  region = state_to_region[states],
  stringsAsFactors = FALSE
)


df.pca2 <- left_join(df.pca, df.mapper)
Joining with `by = join_by(state)`
ggplot(df.pca2, mapping = aes(x = PC1, y = PC2, color = region)) +
  geom_point() +
  scale_color_brewer(type = "qual", palette = "Dark2") +
  geom_text_repel(aes(label = state), show.legend = FALSE) +
  xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
  ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))

Correlations of PC1 with original variables

corrs <- cor(pca$x, df.total.long.noindia.nona) %>% as.data.frame()

pc1 <- corrs["PC1", ]
pc2 <- corrs["PC2", ]
pheatmap::pheatmap(corrs, fontsize = 4)

corrs.df <- as.data.frame(corrs)
corrs.df$PC <- rownames(corrs.df)
corrs.df <- reshape2::melt(corrs.df)
Using PC as id variables
corrs.df.pc1 <- corrs.df %>%
  filter(PC == "PC1") %>%
  arrange(desc((value)))
DT::datatable(corrs.df.pc1)
DT::datatable(corrs.df.pc1 %>% arrange(desc((value))) %>% head())
DT::datatable(corrs.df.pc1 %>% arrange(((value))) %>% head())

How does ind88 vary?

df.total.long.noindia.nona$state <- rownames(df.total.long.noindia.nona)
df.pca2 <- left_join(df.pca, df.total.long.noindia.nona)
Joining with `by = join_by(state)`
ggplot(df.pca2, mapping = aes(x = PC1, y = PC2, fill = ind88)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = median(df.pca2$ind88)) +
  geom_text_repel(aes(label = state), show.legend = FALSE) +
  xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
  ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))

How does ind2 vary?

ggplot(df.pca2, mapping = aes(x = PC1, y = PC2, fill = ind2)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = median(df.pca2$ind2)) +
  geom_text_repel(aes(label = state), show.legend = FALSE) +
  xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
  ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))