from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the difference between the mean values of two groups.

    Takes: table holding the data, column label of the numerical variable,
           column label of the variable that assigns each row to a group
    Returns: mean of the group listed second in the grouped table minus
             the mean of the group listed first
    """
    # Keep only the two relevant columns, average the numerical column
    # within each group, and pull out the resulting array of group means
    # (column 0 of the grouped table is the group label, column 1 the mean).
    group_means = (
        table.select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )
    return group_means.item(1) - group_means.item(0)
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate the difference of group means once, with group labels shuffled.

    Takes: table holding the data, column label of the numerical variable,
           column label of the variable that assigns each row to a group
    Returns: difference of means of the two groups when the numerical
             values are paired with randomly shuffled group labels
    """
    shuffled_column = 'Shuffled Label'

    # Draw the rows without replacement and read off the group column:
    # this is the array of shuffled labels.
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)

    # Pair the numerical values (in their original order) with the
    # shuffled labels.
    numeric_only = table.select(numeric_label)
    relabelled = numeric_only.with_column(shuffled_column, permuted_labels)

    return difference_of_means(relabelled, numeric_label, shuffled_column)
# Load the births data from baby.csv.
births = Table.read_table('baby.csv')
# Average every other column within each 'Maternal Smoker' group
# (displayed as the cell's output).
births.group('Maternal Smoker', np.average)
Maternal Smoker | Birth Weight average | Gestational Days average | Maternal Age average | Maternal Height average | Maternal Pregnancy Weight average |
---|---|---|---|---|---|
False | 123.085 | 279.874 | 27.5441 | 64.014 | 129.48 |
True | 113.819 | 277.898 | 26.7364 | 64.1046 | 126.919 |
# Load the trial data from bta.csv; the displayed table has a 'Group'
# column (Control / Treatment) and a 0/1 'Result' column.
botox = Table.read_table('bta.csv')
# show() with no argument displays the table's rows rather than a
# truncated preview.
botox.show()
Group | Result |
---|---|
Control | 1 |
Control | 1 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Control | 0 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 1 |
Treatment | 0 |
Treatment | 0 |
Treatment | 0 |
Treatment | 0 |
Treatment | 0 |
Treatment | 0 |
# Cross-tabulate the data: one row per Group, one column per Result value,
# each cell holding the number of rows with that combination.
botox.pivot('Result', 'Group')
Group | 0.0 | 1.0 |
---|---|---|
Control | 14 | 2 |
Treatment | 6 | 9 |
# Average Result within each Group; because Result is 0 or 1 in this data,
# each average is the proportion of 1s in that group.
botox.group('Group', np.average)
Group | Result average |
---|---|
Control | 0.125 |
Treatment | 0.6 |
# Observed test statistic: mean Result of the second group minus the mean
# of the first (Treatment minus Control, given the group order shown above).
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff
0.475
# One simulated value of the statistic, computed after shuffling the
# Group labels; the result varies from run to run.
one_simulated_difference(botox, 'Result', 'Group')
0.08750000000000002
# Simulate the shuffled-label difference of means 10,000 times.
# The results are collected in a list and converted to an array once:
# np.append returns a fresh copy of the whole array on every call, so
# growing the array inside the loop does quadratic work for no benefit.
repetitions = 10000
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in range(repetitions)
])

# Histogram of the simulated differences.
col_name = 'Distances between groups'
Table().with_column(col_name, simulated_diffs).hist(col_name)
# p-value: the fraction of simulated differences that are at least as
# large as the observed difference.  The mean of a boolean array is the
# proportion of its entries that are True.
np.mean(simulated_diffs >= observed_diff)
0.0060000000000000001