suppressPackageStartupMessages({
library(tidyverse)
library(ggpubr)
library(factoextra)
library(kableExtra)
library(ggpubr)
library(ggrepel)
library(viridis)
})
theme_set(theme_pubr())
NFHS5 PCA
Read data
# data credits:https://github.com/pratapvardhan/NFHS-5/
<- read_csv("https://raw.githubusercontent.com/pratapvardhan/NFHS-5/refs/heads/master/NFHS-5-States.csv") %>%
indicators pull(indicator) %>%
unique() %>%
as.character() %>%
sort()
Rows: 4847 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): state, state_code, indicator
dbl (4): nfhs5_urban, nfhs5_rural, nfhs5_total, nfhs4_total
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- data.frame(indicator = indicators, stringsAsFactors = F)
indicators.df $indicator_code <- paste0("ind", stringr::str_split_fixed(indicators.df$indicator, pattern = "\\.", n = 2)[, 1])
indicators.df::datatable(indicators.df) DT
Preprocess dataframe
<- read_csv("https://raw.githubusercontent.com/pratapvardhan/NFHS-5/refs/heads/master/NFHS-5-States.csv") df
Rows: 4847 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): state, state_code, indicator
dbl (4): nfhs5_urban, nfhs5_rural, nfhs5_total, nfhs4_total
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
$nfhs5_total <- as.numeric(df$nfhs5_total)
df<- left_join(df, indicators.df) df
Joining with `by = join_by(indicator)`
<- df %>%
df.total select(state, indicator_code, nfhs5_total) %>%
drop_na()
head(df.total)
# A tibble: 6 × 3
state indicator_code nfhs5_total
<chr> <chr> <dbl>
1 India ind1 71.8
2 India ind2 26.5
3 India ind3 1020
4 India ind4 929
5 India ind5 89.1
6 India ind6 70.8
<- df.total %>%
df.total.long pivot_wider(names_from = indicator_code, values_from = nfhs5_total) %>%
as.data.frame()
rownames(df.total.long) <- df.total.long$state
<- df.total.long %>%
na_cols summarise_all(funs(sum(is.na(.)))) %>%
as.numeric()
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:
# Simple named list: list(mean = mean, median = median)
# Auto named with `tibble::lst()`: tibble::lst(mean, median)
# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
<- df.total.long %>% filter(state != "India")
df.total.long.noindia
$state <- NULL
df.total.long$state <- NULL
df.total.long.noindia
<- df.total.long.noindia %>%
na_cols summarise_all(funs(sum(is.na(.)))) %>%
as.numeric()
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:
# Simple named list: list(mean = mean, median = median)
# Auto named with `tibble::lst()`: tibble::lst(mean, median)
# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
<- df.total.long.noindia[, na_cols == 0]
df.total.long.noindia.nona ::datatable(df.total.long.noindia.nona) DT
Perform PCA
<- prcomp(df.total.long.noindia.nona,
pca scale = TRUE
)summary(pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 5.3462 4.3451 3.30448 2.76707 2.63619 2.16158 2.05119
Proportion of Variance 0.2485 0.1642 0.09495 0.06658 0.06043 0.04063 0.03659
Cumulative Proportion 0.2485 0.4127 0.50766 0.57424 0.63467 0.67530 0.71189
PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 1.97192 1.8481 1.73946 1.6085 1.54106 1.43664 1.37908
Proportion of Variance 0.03381 0.0297 0.02631 0.0225 0.02065 0.01795 0.01654
Cumulative Proportion 0.74570 0.7754 0.80171 0.8242 0.84486 0.86281 0.87935
PC15 PC16 PC17 PC18 PC19 PC20 PC21
Standard deviation 1.31581 1.28831 1.21075 1.12196 0.95072 0.93385 0.87917
Proportion of Variance 0.01506 0.01443 0.01275 0.01095 0.00786 0.00758 0.00672
Cumulative Proportion 0.89440 0.90883 0.92158 0.93253 0.94039 0.94797 0.95469
PC22 PC23 PC24 PC25 PC26 PC27 PC28
Standard deviation 0.85576 0.80438 0.76163 0.74050 0.71980 0.60236 0.5776
Proportion of Variance 0.00637 0.00563 0.00504 0.00477 0.00451 0.00316 0.0029
Cumulative Proportion 0.96106 0.96668 0.97173 0.97650 0.98100 0.98416 0.9871
PC29 PC30 PC31 PC32 PC33 PC34 PC35
Standard deviation 0.5575 0.52707 0.4913 0.44565 0.43720 0.40020 0.32928
Proportion of Variance 0.0027 0.00242 0.0021 0.00173 0.00166 0.00139 0.00094
Cumulative Proportion 0.9898 0.99218 0.9943 0.99600 0.99766 0.99906 1.00000
PC36
Standard deviation 1.146e-14
Proportion of Variance 0.000e+00
Cumulative Proportion 1.000e+00
How much variance is explained
plot(pca, type = "l")
<- 100 * pca$sdev^2 / sum(pca$sdev^2)
percent.var.explained barplot(percent.var.explained)
Plot PCA
<- df.total.long.noindia.nona
df <- prcomp(df, scale = TRUE)
pca <- as.data.frame(pca$x)
df.pca $state <- rownames(df.total.long.noindia.nona)
df.pca<- df.pca %>% select(state, PC1, PC2)
df.pca ggplot(df.pca, mapping = aes(x = PC1, y = PC2)) +
geom_point() +
geom_text_repel(aes(label = state), hjust = 0, vjust = 0) +
xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))
Add metadata to states
<- c(
states "Andaman & Nicobar Islands", "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar",
"Chandigarh", "Chhattisgarh", "Dadra & Nagar Haveli and Daman & Diu", "NCT Delhi",
"Goa", "Gujarat", "Himachal Pradesh", "Haryana", "Jharkhand", "Jammu & Kashmir",
"Karnataka", "Kerala", "Lakshadweep", "Ladakh", "Maharashtra", "Meghalaya",
"Manipur", "Madhya Pradesh", "Mizoram", "Nagaland", "Odisha", "Punjab", "Puducherry",
"Rajasthan", "Sikkim", "Telangana", "Tamil Nadu", "Tripura", "Uttar Pradesh",
"Uttarakhand", "West Bengal"
)
# mapping of states to regions
<- list(
region_mapping "North" = c(
"Chandigarh", "NCT Delhi", "Haryana", "Himachal Pradesh", "Jammu & Kashmir",
"Ladakh", "Punjab", "Rajasthan", "Uttarakhand", "Uttar Pradesh"
),"South" = c("Andhra Pradesh", "Karnataka", "Kerala", "Puducherry", "Tamil Nadu", "Telangana"),
"East" = c("Bihar", "Jharkhand", "Odisha", "West Bengal"),
"West" = c("Goa", "Gujarat", "Maharashtra", "Rajasthan", "Dadra & Nagar Haveli and Daman & Diu"),
"Northeast" = c(
"Arunachal Pradesh", "Assam", "Manipur", "Meghalaya", "Mizoram",
"Nagaland", "Sikkim", "Tripura"
),"Central" = c("Chhattisgarh", "Madhya Pradesh"),
"Islands" = c("Andaman & Nicobar Islands", "Lakshadweep")
)
<- unlist(lapply(names(region_mapping), function(region) {
state_to_region setNames(rep(region, length(region_mapping[[region]])), region_mapping[[region]])
}))
# Create a data frame
<- data.frame(
df.mapper state = states,
region = state_to_region[states],
stringsAsFactors = FALSE
)
<- left_join(df.pca, df.mapper) df.pca2
Joining with `by = join_by(state)`
ggplot(df.pca2, mapping = aes(x = PC1, y = PC2, color = region)) +
geom_point() +
scale_color_brewer(type = "qual", palette = "Dark2") +
geom_text_repel(aes(label = state), show.legend = FALSE) +
xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))
Correlations of PC1 with original variables
<- cor(pca$x, df.total.long.noindia.nona) %>% as.data.frame()
corrs
<- corrs["PC1", ]
pc1 <- corrs["PC2", ]
pc2 ::pheatmap(corrs, fontsize = 4) pheatmap
<- as.data.frame(corrs)
corrs.df $PC <- rownames(corrs.df)
corrs.df<- reshape2::melt(corrs.df) corrs.df
Using PC as id variables
<- corrs.df %>%
corrs.df.pc1 filter(PC == "PC1") %>%
arrange(desc((value)))
::datatable(corrs.df.pc1) DT
::datatable(corrs.df.pc1 %>% arrange(desc((value))) %>% head()) DT
::datatable(corrs.df.pc1 %>% arrange(((value))) %>% head()) DT
How does ind88 vary?
$state <- rownames(df.total.long.noindia.nona)
df.total.long.noindia.nona<- left_join(df.pca, df.total.long.noindia.nona) df.pca2
Joining with `by = join_by(state)`
ggplot(df.pca2, mapping = aes(x = PC1, y = PC2, fill = ind88)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = median(df.pca2$ind88)) +
geom_text_repel(aes(label = state), show.legend = FALSE) +
xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))
How does ind2 vary?
ggplot(df.pca2, mapping = aes(x = PC1, y = PC2, fill = ind2)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = median(df.pca2$ind2)) +
geom_text_repel(aes(label = state), show.legend = FALSE) +
xlab(paste0("PC1 (", round(percent.var.explained[1], 2), "%)")) +
ylab(paste0("PC2 (", round(percent.var.explained[2], 2), "%)"))