Expectation Maximisation with Python : Coin Toss¶
This notebook implements the example, I consider a classic for understanding Expectation Maximisation.
See: http://www.nature.com/nbt/journal/v26/n8/full/nbt1406.html
Notations:
\begin{align*} \theta_A &= \text{Probability of a Heads showing up given the coin tossed is A}\\ \theta_B &= \text{Probability of a Heads showing up given the coin tossed is B}\\ \end{align*}%matplotlib notebook
from __future__ import division
from collections import OrderedDict
from scipy.stats import binom as binomial
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from ipywidgets import StaticInteract, RangeWidget
import pandas as pd
from IPython.display import display, Image
from scipy.spatial.distance import euclidean
from sympy import init_printing, symbols, Eq
init_printing()
Image('images/nbt1406-F1.png')
coin_toss = []
coin_toss.append('H T T T H H T H T H'.split())
coin_toss.append('H H H H T H H H H H'.split())
coin_toss.append('H T H H H H H T H H'.split())
coin_toss.append('H T H T T T H H T T'.split())
coin_toss.append('T H H H T H H H T H'.split())
columns = range(1,11)
df = pd.DataFrame(coin_toss, index=None, columns=columns)
df.index.rename('Toss')
Our configuration looks like this:
df
Case 1: Identity of coin being tossed known¶
If the identity of the coin being tossed is known and is observed = ['B', 'A', 'A', 'B', 'A']
it is not so difficult to calculate the corresponding values of $\theta_A$ and $\theta_B$:
thetaA, thetaB = symbols('theta_A theta_B')
a,b = thetaA, thetaB # Hack to display
## Observed Case
observed = ['B', 'A', 'A', 'B', 'A']
index_A = [i for i,x in enumerate(observed) if x=='A']
index_B = [i for i,x in enumerate(observed) if x=='B']
total_tosses = df.size
A_tosses = df.iloc[index_A].unstack()
B_tosses = df.iloc[index_B].unstack()
A_heads = A_tosses.value_counts()['H']
B_heads = B_tosses.value_counts()['H']
theta_A = A_heads/A_tosses.size
theta_B = B_heads/B_tosses.size
(a, theta_A)
(b, theta_B)
Case 2 Identity of coin being tossed is unknown¶
When the identity of coin being tossed is unknwon we rely on Expectation Maximisation to give us the estimates of $\theta_A$ and $\theta_B$. We start with an initial value of $\theta_A, \theta_B$ and then given the observed data (the 50 coin tosses) run the 'E-step' calculating the probability of coin $A$ or $B$ being used for a series of tosses(Remember each set of 10 coin tosses is done using
thetaA = 0.6
thetaB = 0.5
def em(theta_old):
row_prob = []
## Expectation
for row in coin_toss:
count_heads = row.count('H')
p_a = binomial.pmf(count_heads, len(row), theta_old['A'])
p_b = binomial.pmf(count_heads, len(row), theta_old['B'])
p_t = p_a+p_b
p_a = p_a/p_t
p_b = p_b/p_t
row_prob.append({'A': p_a, 'B': p_b, 'count_heads': count_heads, 'total_tosses': len(row)})
## Maximisation
new_coin_toss = []
for row in row_prob:
total_tosses = row['total_tosses']
total_heads = row['count_heads']
A_heads = row['A']*total_heads
A_tails = row['A']*(total_tosses-total_heads)
B_heads = row['B']*total_heads
B_tails = row['B']*(total_tosses-total_heads)
new_coin_toss.append([A_heads, A_tails, B_heads, B_tails])
df = pd.DataFrame(new_coin_toss, columns=['A Heads', 'A Tails', 'B Heads', 'B Tails'])
new_pa = df['A Heads'].sum()/(df['A Heads'].sum()+df['A Tails'].sum())
new_pb = df['B Heads'].sum()/(df['B Heads'].sum()+df['B Tails'].sum())
new_theta = OrderedDict({'A': new_pa, 'B': new_pb})
display(df)
return new_theta
theta = OrderedDict({'A': thetaA, 'B': thetaB})
theta_new = OrderedDict()
max_iterations = 10000
iterations = 0
diff = 1
tolerance = 1e-6
while (iterations < max_iterations) and (diff>tolerance):
new_theta = em(theta)
diff = euclidean(new_theta.values(), theta.values())
theta = new_theta
(a, new_theta['A'])
(b, new_theta['B'])
Comments