library(tidyverse)
library(MASS)
library(foreign)
library(ggplot2)
library(ggpubr)
theme_set(theme_pubr())
url <- "https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta"
dat <- read.dta(url)
# attendance data of 314 high school juniors from two urban high schools
# Response variable = days absent: `daysabs`.
# Variable math is the standardized math score for each student.
# Variable prog is a three-level nominal variable indicating the type of instructional program in which the student is enrolled.
dat <- within(dat, {
prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
id <- factor(id)
})Lecture06-attendance
head(dat)Goal
We are interested in finding if the attendance behavior is different across the three types of programs (General, Academic, Vocational) and how it relates to the math scores of the students.
What should I do here?
hist(dat$daysabs)ggplot(data = dat, aes(x = daysabs, fill = prog)) +
geom_boxplot()ggplot(data = dat, aes(x = daysabs, fill = gender)) +
geom_boxplot()sum(is.na(dat$daysabs))sum(is.na(dat$math))table(dat$gender)ggplot(dat) + geom_point(aes(x = math, y = daysabs,
shape = gender,
color = prog), alpha = 0.9) +
labs(title = "Math Score vs Days Absent",
x = "Math Score",
y = "Days Absent",
color = "Program Type") +
theme_minimal()lm.fit <- lm(daysabs ~ math, data = dat)
lm.fit2 <- lm(sqrt(daysabs) ~ math, data = dat)
preds <- data.frame(math = dat$math,
daysabs = predict(lm.fit))
preds2 <- data.frame(math = dat$math,
daysabs = predict(lm.fit2)^2)
ggplot(dat) + geom_point(aes(x = math, y = daysabs,
shape = gender,
color = prog), alpha = 0.9) +
geom_line(data = preds, aes(x = math, y = daysabs), color = "black") +
geom_line(data = preds2, aes(x = math, y = daysabs), color = "red") library(fitdistrplus)
z <- fitdist(dat$daysabs, "pois")
hist(dat$daysabs, breaks = 30, probability = TRUE, main = "Histogram of Days Absent")
hist(dpois(seq(0, max(dat$daysabs), by = 1), lambda = z$estimate), col = "blue", lwd = 2)summary(lm.fit)cor(dat$daysabs, dat$math )^2Model fitting
model_linear <- lm(daysabs ~ math + prog, data = dat)
model_poisson <- glm(daysabs ~ math + prog,
family = poisson(link = "log"), data = dat)
model_nb <- glm.nb(daysabs ~ math + prog, data = dat)Which model is good among the ones?
AIC(model_linear, model_poisson, model_nb)But why is it good?
math_range <- seq(from = min(dat$math), to = max(dat$math), length.out = 100)
programs <- c("General", "Academic", "Vocational")
pred_data <- expand.grid(
math = math_range,
prog = factor(programs, levels = c("General", "Academic", "Vocational"))
)pred_data$linear <- predict(model_linear, newdata = pred_data)
pred_data$poisson <- predict(model_poisson, newdata = pred_data, type = "response")
pred_data$nb <- predict(model_nb, newdata = pred_data, type = "response")Are there any negative predictions?
negative_preds <- sum(pred_data$linear < 0)
negative_predsggplot() +
geom_point(data = dat, aes(x = math, y = daysabs, color = prog),
alpha = 0.6, size = 1.5) +
geom_line(data = pred_data, aes(x = math, y = linear, color = prog),
linetype = "dashed", size = 1.2, alpha = 0.8) +
geom_line(data = pred_data, aes(x = math, y = poisson, color = prog),
linetype = "solid", size = 1.2) +
geom_hline(yintercept = 0, color = "black", linetype = "dotted", alpha = 0.7) +
labs(title = "LM vs GLM",
subtitle = "Dashed = Linear Regression, Solid = Poisson GLM",
x = "Math Score",
y = "Predicted Days Absent",
color = "Program Type")ggplot() +
geom_point(data = dat, aes(x = math, y = daysabs, color = prog),
alpha = 0.6, size = 1.5) +
geom_line(data = pred_data, aes(x = math, y = linear, color = prog),
linetype = "dashed", size = 1.2, alpha = 0.8) +
#geom_line(data = pred_data, aes(x = math, y = poisson, color = prog),
# linetype = "solid", size = 1.2) +
geom_line(data = pred_data, aes(x = math, y = nb, color = prog),
linetype = "dotdash", size = 1.2, alpha = 0.9) +
geom_hline(yintercept = 0, color = "black", linetype = "dotted", alpha = 0.7) +
labs(title = "LM vs Poisson vs Negative Binomial GLM",
subtitle = "Dashed = Linear, Solid = Poisson, Dot-dash = Negative Binomial",
x = "Math Score",
y = "Predicted Days Absent",
color = "Program Type") +
scale_color_manual(values = c("General" = "#E31A1C",
"Academic" = "#1F78B4",
"Vocational" = "#33A02C")) +
theme_minimal()