In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")

Review: Comparing Two Samples¶

In [2]:
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the gap between the two group averages of a numerical column.

    Takes:
      table: table holding the data
      numeric_label: column label of the numerical variable
      group_label: column label of the variable that marks the groups

    Returns: (average of the second group) - (average of the first group),
    where "first" and "second" follow the row order of the grouped table
    """
    # Keep only the two relevant columns, then average within each group.
    # In the grouped table, column 0 holds the group labels and column 1
    # holds the group averages, so column(1) is the array of averages.
    group_averages = (
        table
        .select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    first_average = group_averages.item(0)
    second_average = group_averages.item(1)
    return second_average - first_average
In [3]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one difference of group means under shuffled group labels.

    Takes:
      table: table holding the data
      numeric_label: column label of the numerical variable
      group_label: column label of the group-label variable

    Returns: Difference of means of the two groups after the group labels
    have been shuffled among the rows
    """
    shuffled_column = 'Shuffled Label'

    # Draw the rows without replacement and read off the group column;
    # this array serves as the shuffled labels.
    resampled_rows = table.sample(with_replacement = False)
    permuted_labels = resampled_rows.column(group_label)

    # Pair the numerical values, in their original order, with the
    # shuffled labels.
    numeric_only = table.select(numeric_label)
    relabeled = numeric_only.with_column(shuffled_column, permuted_labels)

    return difference_of_means(relabeled, numeric_label, shuffled_column)
In [4]:
# Load the births data from baby.csv into a table.
births = Table.read_table('baby.csv')
In [5]:
# Average of every other column within each 'Maternal Smoker' group.
births.group('Maternal Smoker', np.average)
Out[5]:
Maternal Smoker Birth Weight average Gestational Days average Maternal Age average Maternal Height average Maternal Pregnancy Weight average
False 123.085 279.874 27.5441 64.014 129.48
True 113.819 277.898 26.7364 64.1046 126.919

Randomized Controlled Experiment¶

In [6]:
# Load the trial data from bta.csv (columns: Group, Result) and display its rows.
botox = Table.read_table('bta.csv')
botox.show()
Group Result
Control 1
Control 1
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 0
Treatment 0
Treatment 0
Treatment 0
Treatment 0
Treatment 0
In [7]:
# Cross-tabulation: number of rows for each Group (rows) and Result value (columns).
botox.pivot('Result', 'Group')
Out[7]:
Group 0.0 1.0
Control 14 2
Treatment 6 9
In [8]:
# Average Result within each Group; with Result recorded as 0 or 1,
# this is the proportion of 1s in that group.
botox.group('Group', np.average)
Out[8]:
Group Result average
Control 0.125
Treatment 0.6

Testing the Hypothesis¶

In [9]:
# Observed test statistic: difference between the two groups' average Result
# (in the displayed output, 0.6 for Treatment minus 0.125 for Control).
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff
Out[9]:
0.475
In [10]:
# One simulated value of the statistic after shuffling the group labels;
# the value changes from run to run.
one_simulated_difference(botox, 'Result', 'Group')
Out[10]:
0.08750000000000002
In [11]:
# Simulate the statistic many times with shuffled group labels and keep
# every simulated difference of means.
num_simulations = 10000

# Build the array once from the collected values: np.append returns a new
# copy of the whole array on every call, so appending inside the loop does
# quadratic work in the number of simulations.
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in range(num_simulations)
])
In [12]:
# Histogram of the simulated differences of means.
col_name = 'Distances between groups'
simulated_table = Table().with_column(col_name, simulated_diffs)
simulated_table.hist(col_name)
In [13]:
# p-value: proportion of simulated differences that are at least as large as
# the observed difference.
# np.count_nonzero counts the True entries in one vectorized pass, instead of
# iterating over the numpy array element by element with the built-in sum.
np.count_nonzero(simulated_diffs >= observed_diff) / len(simulated_diffs)
Out[13]:
0.0060000000000000001