In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")

Comparing Two Samples¶

In [2]:
# Load the births data set from the CSV file (path is relative to the working directory).
births = Table.read_table('baby.csv')
In [3]:
# Preview the table; only the first 10 rows are displayed.
births
Out[3]:
Birth Weight Gestational Days Maternal Age Maternal Height Maternal Pregnancy Weight Maternal Smoker
120 284 27 62 100 False
113 282 33 64 135 False
128 279 28 64 115 True
108 282 23 67 125 True
136 286 25 62 93 False
138 244 33 62 178 False
132 245 23 65 140 False
120 289 25 62 125 False
143 299 30 66 136 True
140 351 27 68 120 False

... (1164 rows omitted)

In [4]:
# Keep only the two columns needed to compare birth weights by smoking status.
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')
In [5]:
# Grouping with no aggregation function counts the rows in each category.
smoking_and_birthweight.group('Maternal Smoker')
Out[5]:
Maternal Smoker count
False 715
True 459
In [6]:
# Histogram of birth weights, split by the mother's smoking status.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

Test Statistic¶

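[Question] Our test statistic is the average birth weight of the smokers' babies minus the average birth weight of the non-smokers' babies. Which values of this statistic favor the alternative hypothesis: positive or negative?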

In [7]:
# Average birth weight within each smoking group (one row per group).
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table
Out[7]:
Maternal Smoker Birth Weight average
False 123.085
True 113.819
In [8]:
# Group means as an array: index 0 is the False (non-smoker) row,
# index 1 is the True (smoker) row of means_table.
means = means_table.column('Birth Weight average')

# Observed test statistic: smokers' mean minus non-smokers' mean.
observed_difference = means.item(1) - means.item(0)
observed_difference
Out[8]:
-9.266142572024918
In [9]:
def difference_of_means(table, numeric_label, category_label):
    """
    Difference between the means of a numerical column in two groups.

    Takes: 
       - table: the Table holding the data
       - numeric_label: column label of numerical variable
       - category_label: column label of categorical variable; it must
         split the rows into exactly two groups
       
    Returns: mean of the second group minus mean of the first group, with
    the groups in the order they appear in the grouped table (for a
    True/False category this is the True mean minus the False mean)

    Raises: ValueError if the categorical column does not contain exactly
    two distinct groups
    """
    
    # table with the two relevant columns, so only the numeric one is averaged
    reduced = table.select(numeric_label, category_label)  
    
    # table containing group means: column 0 is the category, column 1 the mean
    means_table = reduced.group(category_label, np.average)
    
    # array of group means
    means = means_table.column(1)
    
    # A difference is only meaningful for two groups; without this check a
    # third group would be silently ignored.
    if len(means) != 2:
        raise ValueError(
            'difference_of_means needs exactly two groups in {!r}, found {}'.format(
                category_label, len(means)
            )
        )
    
    return means.item(1) - means.item(0)
In [10]:
# Same statistic as observed_difference, now computed by the reusable function.
difference_of_means(
    table=births,
    numeric_label='Birth Weight',
    category_label='Maternal Smoker',
)
Out[10]:
-9.266142572024918

Random Permutation (Shuffling)¶

In [11]:
# Tiny example table used to demonstrate sampling and shuffling.
staff = (
    Table()
    .with_column('Names', make_array('Jim', 'Pam', 'Dwight', 'Michael'))
    .with_column('Ages', make_array(29, 28, 34, 41))
)
In [12]:
# With no arguments, sample() draws as many rows as the table has, with
# replacement, so the same row can appear more than once.
staff.sample()
Out[12]:
Names Ages
Dwight 34
Jim 29
Dwight 34
Michael 41
In [13]:
# Without replacement every row is drawn exactly once: a random permutation of the rows.
staff.sample(with_replacement = False)
Out[13]:
Names Ages
Michael 41
Pam 28
Jim 29
Dwight 34
In [14]:
# Attach a randomly permuted copy of the 'Names' column next to the original rows.
staff.with_column(
    'Shuffled',
    staff.sample(with_replacement=False).column('Names'),
)
Out[14]:
Names Ages Shuffled
Jim 29 Michael
Pam 28 Dwight
Dwight 34 Jim
Michael 41 Pam

Simulation Under Null Hypothesis¶

In [15]:
# Reminder of the two-column table the simulation works with.
smoking_and_birthweight
Out[15]:
Maternal Smoker Birth Weight
False 120
False 113
True 128
True 108
False 136
False 138
False 132
False 120
True 143
False 140

... (1164 rows omitted)

In [16]:
# Randomly permute the rows, then keep only the smoker labels: the result is
# the original labels in a random order.
shuffled_labels = (
    smoking_and_birthweight
    .sample(with_replacement=False)
    .column('Maternal Smoker')
)
In [17]:
# Put the shuffled labels next to the original labels and the birth weights.
original_and_shuffled = smoking_and_birthweight.with_columns('Shuffled Label', shuffled_labels)
In [18]:
# Compare the original labels with the shuffled ones, row by row.
original_and_shuffled
Out[18]:
Maternal Smoker Birth Weight Shuffled Label
False 120 False
False 113 True
True 128 True
True 108 False
False 136 False
False 138 False
False 132 False
False 120 False
True 143 True
False 140 True

... (1164 rows omitted)

In [19]:
# Statistic computed with the shuffled labels: one simulated value under the null.
difference_of_means(
    table=original_and_shuffled,
    numeric_label='Birth Weight',
    category_label='Shuffled Label',
)
Out[19]:
-1.3711504182092398
In [20]:
# Statistic computed with the original labels: reproduces the observed difference.
difference_of_means(
    table=original_and_shuffled,
    numeric_label='Birth Weight',
    category_label='Maternal Smoker',
)
Out[20]:
-9.266142572024918

Permutation Test¶

In [21]:
def one_simulated_difference(table, numeric_label, category_label):
    """
    Simulates the test statistic once by randomly shuffling the group labels.

    Takes: 
       - table: the Table holding the data
       - numeric_label: column label of numerical variable
       - category_label: column label of categorical variable
       
    Returns: Difference of means of the two groups after the category
    labels have been randomly permuted among the rows
    """
    
    # array of shuffled labels; only the label column is shuffled, because
    # every other column of the sampled table would be thrown away
    shuffled_labels = table.select(category_label).sample(with_replacement = False).column(0)
    
    # table of numerical variable and shuffled labels
    shuffled_table = table.select(numeric_label).with_column('Shuffled Label', shuffled_labels)
    
    return difference_of_means(shuffled_table, numeric_label, 'Shuffled Label')
In [22]:
# One simulated value of the statistic; the result changes on every run
# because the labels are reshuffled each time.
one_simulated_difference(
    table=births,
    numeric_label='Birth Weight',
    category_label='Maternal Smoker',
)
Out[22]:
0.8431677255206722
In [23]:
# Number of label shuffles used to approximate the distribution of the
# statistic under the null hypothesis.
repetitions = 2500

# Build the array once from all simulated values; calling np.append inside a
# loop would copy the whole array on every iteration.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in np.arange(repetitions)
])
In [24]:
# Empirical distribution of the simulated differences.
Table().with_columns(
    'Difference Between Group Means', differences
).hist()

# Report the observed statistic for comparison with the histogram.
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');
Observed Difference: -9.266142572024918