---
title: "Fitting Distributions"
format:
html:
code-tools: true
editor: visual
---
```{r}
library (tidyverse)
```
## Bernoulli
```{r}
?rbinom
```
```{r}
# simulate 100 bernoulli trials
set.seed (42 )
trials_025 <- rbinom (n = 100 , size = 1 , prob = 0.25 )
trials_025
```
How does the distribution look like?
```{r}
hist (trials_025)
```
## Varying p (or $\theta$)
```{r}
set.seed (42 )
ps <- c (0 , 0.25 , 0.5 , 0.75 )
data <- list ()
for (p in ps) {
trials <- rbinom (n = 100 , size = 1 , prob = p)
df <- data.frame (successes = trials)
df$ p <- p
data[[as.character (p)]] <- df
}
df_combined <- dplyr:: bind_rows (data)
ggplot (df_combined, aes (x = successes, fill = as.factor (p))) +
geom_histogram (position = "dodge" , bins = 10 ) +
ggpubr:: theme_pubr ()
```
## Estimating $\hat{theta}$
We observe $x_1, x_2, \dots, x_n$ outcomes from a Bernoulli trial and are interested in estimating the $\theta$ parameter.
$$
\begin{align*}
L(\theta) &= \prod_{i=1}^n \theta^{x_i}(1-\theta)^{(1-x_i)}\\
\ell(\theta) &= \log{\theta}\sum_{i=1}^n x_i + \log{(1-\theta)}\sum_{i=1}^n (1-x_i)\\
\end{align*}
$$
In maximum likelihood estimation (MLE), our goal is to estimate the value of $\theta$ such that the value of our likelihood function is maximized. More formally, in MLE we estimate $\hat{\theta}$ such that
\$\$\begin{align*}
\hat{\theta} &= \text{argmax} L(\theta)
\end{align*}
\begin{align*}
\dfrac{\partial\ell(\theta)}{\partial \theta} &= \dfrac{\sum_{i=1}^n x_i}{\theta} - \dfrac{\sum_{i=1}^n (1-x_i)}{1-\theta} \overset{\text{set}}{=}0\\
\sum_{i=1}^n x_i - \theta\sum_{i=1}^n x_i &= \theta\sum_{i=1}^n (1-x_i)\\
\hat{\theta}& = \dfrac{1}{n}\sum_{i=1}^n x_i\\
\dfrac{\partial^2 \ell(\theta)}{\partial \theta^2} &= \dfrac{-\sum_{i=1}^n x_i}{\theta^2} - \dfrac{\sum_{i=1}^n (1-x_i)}{(1-\theta)^2} < 0
\end{align*}\$\$
## Calculating likelihood
```{r}
trials_025[1 : 10 ]
```
```{r}
dbinom (x = trials_025, prob = 0.25 , size = 1 )[1 : 10 ]
```
## Calculating log-likelihood
```{r}
log_likelihood_bernoulli <- function (p, data) {
LL <- sum (dbinom (x = data, prob = p, size = 1 , log = TRUE ))
return (LL)
}
```
```{r}
log_likelihood_bernoulli (p = 0.5 , data = trials_025)
```
```{r}
log_likelihood_bernoulli (p = 0.25 , data = trials_025)
```
```{r}
log_likelihood_bernoulli (p = 0.1 , data = trials_025)
```
## Where does the LL get maximum?
```{r}
p <- seq (0 , 1 , 0.05 )
ll <- sapply (p, log_likelihood_bernoulli, data = trials_025)
df <- data.frame (p = p, ll = ll)
ggplot (df, aes (p, ll)) +
geom_point () +
theme_classic ()
```
```{r}
df %>% arrange (desc (ll))
```
# Binomial
```{r}
p <- 0.5
N <- 100
binom_05 <- dbinom (x = 0 : 100 , prob = p, size = N)
sum (binom_05)
```
```{r}
hist (binom_05)
```
```{r}
N <- 10
p_vec <- c (0.2 , 0.5 , 0.75 )
set.seed (42 )
binoms.list <- list ()
for (p in p_vec) {
binoms <- rbinom (n = 20 , prob = p, size = N)
df <- data.frame (binom = binoms)
df$ p <- p
binoms.list[[paste0 (p)]] <- df
}
binoms.df <- bind_rows (binoms.list)
binoms.df.freq <- binoms.df %>%
group_by (binom, p) %>%
summarise (Freq = n ())
ggplot (binoms.df.freq, aes (binom, Freq, fill = as.factor (p))) +
geom_bar (position = "dodge" , stat = "identity" ) +
theme_classic ()
```
```{r}
```
# Poisson
```{r}
lambdas <- c (1 , 4 , 10 , 15 )
set.seed (42 )
pois.data <- list ()
for (lambda in lambdas) {
pois <- rpois (n = 20 , lambda = lambda)
df <- table (pois) %>% as.data.frame ()
df <- data.frame (pois = pois)
df$ lambda <- lambda
pois.data[[paste0 (lambda)]] <- df
}
pois.df <- bind_rows (pois.data)
pois.df.freq <- pois.df %>%
group_by (pois, lambda) %>%
summarise (Freq = n ())
ggplot (pois.df.freq, aes (pois, Freq, fill = as.factor (lambda))) +
geom_bar (position = "dodge" , stat = "identity" ) +
theme_classic ()
```
```{r}
ggplot (pois.df, aes (pois, color = as.factor (lambda))) +
geom_density () +
ggpubr:: theme_pubr () +
scale_color_brewer (palette = "Set1" , name = "lambda" )
```
# Why is a poisson used for rare events?
```{r}
p <- 0.05
N <- 100
binoms <- rbinom (n = 20 , prob = p, size = N)
hist (binoms)
```
```{r}
rpois1 <- rpois (n = 20 , lambda = N* p )
hist (rpois1)
```
## Investigating fit using vcd
We can use [ vcd ](https://cran.r-project.org/web/packages/vcd/index.html) to visualize if the data follows a poisson distribution.
```{r}
library (vcd)
pois1 <- rpois (n = 3000 , lambda = 1 )
hist (pois1)
```
The rootogram shows square root frequency on the y-axis and the observed ( in green bar or predicted - in red) quantity on the x-axis
```{r}
pois1fit <- goodfit (x = pois1, type = "poisson" )
rootogram (pois1fit, xlab = "" , rect_gp = gpar (fill = "chartreuse4" ))
```