from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Proportion of each ethnic group among eligible jurors ('Eligible')
# and among the people who actually appeared on jury panels ('Panels').
jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)
jury
Ethnicity | Eligible | Panels |
---|---|---|
Asian | 0.15 | 0.26 |
Black | 0.18 | 0.08 |
Latino | 0.12 | 0.08 |
White | 0.54 | 0.54 |
Other | 0.01 | 0.04 |
# Horizontal bar chart of the table's numeric columns, labeled by ethnicity
jury.barh('Ethnicity')
# Under the model, this is the true distribution of people
# from which the jurors are randomly sampled
# (the same proportions as the 'Eligible' column of `jury`).
model = make_array(0.15, 0.18, 0.12, 0.54, 0.01)
# Let's simulate a random draw of 1423 jurors from this distribution;
# the result is the proportion of the sample that falls in each category.
simulated = sample_proportions(1423, model)
simulated
array([ 0.17427969, 0.16303584, 0.11384399, 0.53759663, 0.01124385])
# The actual observed distribution (Panels) looks quite different
# from the simulation -- try running this several times to confirm!
# Add the simulated proportions as a new column next to the
# eligible and observed ones so they can be compared row by row.
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated
Ethnicity | Eligible | Panels | Simulated |
---|---|---|---|
Asian | 0.15 | 0.26 | 0.17428 |
Black | 0.18 | 0.08 | 0.163036 |
Latino | 0.12 | 0.08 | 0.113844 |
White | 0.54 | 0.54 | 0.537597 |
Other | 0.01 | 0.04 | 0.0112439 |
# Same bar chart as before, now including the simulated proportions
jury_with_simulated.barh('Ethnicity')
# In the last lecture, the difference between observed black/purple
# and their expected values (26%/75%) was our statistic.
#
# In this case, we need to understand how each of the 5 categories
# differs from its expected value according to the model.
# A positive difference means the group is over-represented on panels
# relative to the eligible population; a negative one, under-represented.
diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference
Ethnicity | Eligible | Panels | Difference |
---|---|---|---|
Asian | 0.15 | 0.26 | 0.11 |
Black | 0.18 | 0.08 | -0.1 |
Latino | 0.12 | 0.08 | -0.04 |
White | 0.54 | 0.54 | 0 |
Other | 0.01 | 0.04 | 0.03 |
def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    The total variation distance is half the sum of the absolute
    differences between corresponding entries of the two distributions.

    Parameters:
        dist1: array-like of proportions (NumPy array, list, or tuple).
        dist2: array-like of proportions for the same categories, in the
            same order as ``dist1``.

    Returns:
        The total variation distance as a number.
    """
    # Coerce to arrays so that plain lists/tuples subtract element-wise
    # instead of raising TypeError; NumPy arrays pass through unchanged.
    dist1 = np.asarray(dist1)
    dist2 = np.asarray(dist2)
    return sum(abs(dist1 - dist2)) / 2
# The TVD of our observed data (Panels) from their expected values
# assuming the model is true (Eligible)
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd
0.14000000000000001
# The TVD of a model simulation from its expected values
tvd(sample_proportions(1423, model), jury.column('Eligible'))
0.0035207308503162615
def simulated_tvd():
    """Simulate one sample under the model and return its TVD from the model.

    Draws a sample of 1423 from the distribution ``model`` and measures
    how far the resulting sample proportions are from ``model`` itself.
    """
    sample_dist = sample_proportions(1423, model)
    return tvd(sample_dist, model)
# Repeat the simulation many times to see which TVD values are typical
# when the model is true.
num_simulations = 10000
# Build the array in one step: calling np.append inside a loop copies the
# entire array on every iteration, which is quadratic in num_simulations.
tvds = np.array([simulated_tvd() for _ in range(num_simulations)])
print('Observed TVD: ' + str(obsvd_tvd))

# Histogram of the simulated TVDs
title = 'Simulated TVDs (if model is true)'
bins = np.arange(0, 0.05, 0.005)
Table().with_column(title, tvds).hist(bins=bins)

# Plotting details; ignore this code
# (marks the observed TVD as a red dot on the horizontal axis)
plots.ylim(-2, 55)
plots.scatter(obsvd_tvd, 0, color='red', s=30);
Observed TVD: 0.14