Lecture06-attendance

library(tidyverse)
library(MASS)        
library(foreign)    
library(ggplot2)
library(ggpubr)
theme_set(theme_pubr())



url <- "https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta"
dat <- read.dta(url)

# attendance data of 314 high school juniors from two urban high schools

# Response variable = days absent: `daysabs`. 
# Variable math is the standardized math score for each student.
# Variable prog is a three-level nominal variable indicating the type of instructional program in which the student is enrolled.



dat <- within(dat, {
  prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
  id <- factor(id)
})
head(dat)

Goal

We are interested in finding if the attendance behavior is different across the three types of programs (General, Academic, Vocational) and how it relates to the math scores of the students.

What should I do here?

hist(dat$daysabs)
ggplot(data = dat, aes(x = daysabs, fill = prog)) +
    geom_boxplot()
ggplot(data = dat, aes(x = daysabs, fill = gender)) +
    geom_boxplot()
sum(is.na(dat$daysabs))
sum(is.na(dat$math))
table(dat$gender)
ggplot(dat) + geom_point(aes(x = math, y = daysabs, 
                             shape = gender,
                             color = prog), alpha = 0.9) +
  labs(title = "Math Score vs Days Absent",
       x = "Math Score",
       y = "Days Absent",
       color = "Program Type") +
  theme_minimal()
lm.fit <- lm(daysabs ~  math, data = dat)

lm.fit2 <- lm(sqrt(daysabs) ~  math, data = dat)


preds <- data.frame(math = dat$math,
            daysabs = predict(lm.fit))
preds2 <- data.frame(math = dat$math,
            daysabs = predict(lm.fit2)^2)

ggplot(dat) + geom_point(aes(x = math, y = daysabs, 
                             shape = gender,
                             color = prog), alpha = 0.9) +
    geom_line(data = preds, aes(x = math, y = daysabs), color = "black") +
    geom_line(data = preds2, aes(x = math, y = daysabs), color = "red") 
library(fitdistrplus)
z <- fitdist(dat$daysabs, "pois")
hist(dat$daysabs, breaks = 30, probability = TRUE, main = "Histogram of Days Absent")
hist(dpois(seq(0, max(dat$daysabs), by = 1), lambda = z$estimate), col = "blue", lwd = 2)
summary(lm.fit)
cor(dat$daysabs, dat$math )^2

Model fitting

model_linear <- lm(daysabs ~ math + prog, data = dat)

model_poisson <- glm(daysabs ~ math + prog, 
                     family = poisson(link = "log"), data = dat)

model_nb <- glm.nb(daysabs ~ math + prog, data = dat)

Which model is good among the ones?

AIC(model_linear, model_poisson, model_nb)

But why is it good?

math_range <- seq(from = min(dat$math), to = max(dat$math), length.out = 100)
programs <- c("General", "Academic", "Vocational")

pred_data <- expand.grid(
  math = math_range,
  prog = factor(programs, levels = c("General", "Academic", "Vocational"))
)
pred_data$linear <- predict(model_linear, newdata = pred_data)

pred_data$poisson <- predict(model_poisson, newdata = pred_data, type = "response")

pred_data$nb <- predict(model_nb, newdata = pred_data, type = "response")

Are there any negative predictions?

negative_preds <- sum(pred_data$linear < 0)
negative_preds
ggplot() +
  geom_point(data = dat, aes(x = math, y = daysabs, color = prog), 
             alpha = 0.6, size = 1.5) +
  
  geom_line(data = pred_data, aes(x = math, y = linear, color = prog), 
            linetype = "dashed", size = 1.2, alpha = 0.8) +
  
  geom_line(data = pred_data, aes(x = math, y = poisson, color = prog), 
            linetype = "solid", size = 1.2) +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dotted", alpha = 0.7) +
    labs(title = "LM vs GLM",
       subtitle = "Dashed = Linear Regression, Solid = Poisson GLM",
       x = "Math Score", 
       y = "Predicted Days Absent",
       color = "Program Type")
ggplot() +
  geom_point(data = dat, aes(x = math, y = daysabs, color = prog), 
             alpha = 0.6, size = 1.5) +
  
  geom_line(data = pred_data, aes(x = math, y = linear, color = prog), 
            linetype = "dashed", size = 1.2, alpha = 0.8) +
  
  #geom_line(data = pred_data, aes(x = math, y = poisson, color = prog), 
  #          linetype = "solid", size = 1.2) +
  
  geom_line(data = pred_data, aes(x = math, y = nb, color = prog), 
            linetype = "dotdash", size = 1.2, alpha = 0.9) +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dotted", alpha = 0.7) +
  
  labs(title = "LM vs Poisson vs Negative Binomial GLM",
       subtitle = "Dashed = Linear, Solid = Poisson, Dot-dash = Negative Binomial",
       x = "Math Score", 
       y = "Predicted Days Absent",
       color = "Program Type") +
  
  scale_color_manual(values = c("General" = "#E31A1C", 
                               "Academic" = "#1F78B4", 
                               "Vocational" = "#33A02C")) +
  theme_minimal()