from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the gap between two group averages of a numeric column.

    Takes: a table, the column label of the numerical variable,
    and the column label of the variable that assigns each row to a group

    Returns: the average of the second group minus the average of the
    first group, taking the groups in the order the grouped table lists them
    """

    # one row per group: the group's label followed by its average
    averages_by_group = (
        table
        .select(numeric_label, group_label)
        .group(group_label, np.average)
    )

    # column 0 holds the group labels, so column 1 holds the averages
    group_averages = averages_by_group.column(1)

    second_mean = group_averages.item(1)
    first_mean = group_averages.item(0)
    return second_mean - first_mean
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one value of the difference of means under random labelling.

    Takes: a table, the column label of the numerical variable,
    and the column label of the group-label variable

    Returns: the difference of the two group means once the group labels
    have been randomly reassigned to the rows
    """

    # sampling every row without replacement yields the rows in random order
    permuted_rows = table.sample(with_replacement = False)
    permuted_labels = permuted_rows.column(group_label)

    # numerical values stay in their original order; only the labels move
    values_only = table.select(numeric_label)
    relabelled = values_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_means(relabelled, numeric_label, 'Shuffled Label')
# Load baby.csv, then display the average of every other column
# separately for each value of 'Maternal Smoker'.
births = Table.read_table('baby.csv')
births.group('Maternal Smoker', np.average)
| Maternal Smoker | Birth Weight average | Gestational Days average | Maternal Age average | Maternal Height average | Maternal Pregnancy Weight average | 
|---|---|---|---|---|---|
| False | 123.085 | 279.874 | 27.5441 | 64.014 | 129.48 | 
| True | 113.819 | 277.898 | 26.7364 | 64.1046 | 126.919 | 
# Load bta.csv (each row has a 'Group' label and a 0/1 'Result')
# and display all of its rows.
botox = Table.read_table('bta.csv')
botox.show()
| Group | Result | 
|---|---|
| Control | 1 | 
| Control | 1 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Control | 0 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 1 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
| Treatment | 0 | 
# Cross-tabulation: one row per Group, one count column per distinct Result value.
botox.pivot('Result', 'Group')
| Group | 0.0 | 1.0 | 
|---|---|---|
| Control | 14 | 2 | 
| Treatment | 6 | 9 | 
# Average Result within each Group; because Result is 0/1,
# each average is the proportion of 1s in that group.
botox.group('Group', np.average)
| Group | Result average | 
|---|---|
| Control | 0.125 | 
| Treatment | 0.6 | 
# Observed test statistic: second group's mean minus the first group's mean
# (Treatment minus Control for this table, i.e. 0.6 - 0.125).
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff
0.475
# A single draw of the statistic after shuffling the Group labels;
# the value is random, so it changes from run to run.
one_simulated_difference(botox, 'Result', 'Group')
0.08750000000000002
# Number of label shufflings used to approximate the distribution of the
# statistic when group labels are assigned at random.
num_simulations = 10000

# Collect every simulated difference in one pass and build the array once.
# (Growing the array with np.append inside a loop re-copies all previous
# values on each iteration, which is quadratic in the number of simulations.)
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in range(num_simulations)
])

# Histogram of the simulated differences.
col_name = 'Distances between groups'
Table().with_column(col_name, simulated_diffs).hist(col_name)
# Empirical p-value: the share of simulated differences that are at least
# as large as the observed difference (upper tail only).
np.mean(simulated_diffs >= observed_diff)
0.0060000000000000001