from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")


# Load the births data from 'baby.csv' into a Table.
births = Table.read_table('baby.csv')


# Display the full table.
births


# Keep only the two columns compared in this analysis.
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')


# Number of rows in each 'Maternal Smoker' group.
smoking_and_birthweight.group('Maternal Smoker')


# Histogram of 'Birth Weight', split by 'Maternal Smoker' group.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')


# Average birth weight within each 'Maternal Smoker' group.
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table


# Column 1 of means_table holds the group averages. In the recorded output the
# False group is listed first, so the statistic is
# (average for smokers) - (average for non-smokers).
means = means_table.column(1)
observed_difference = means.item(1) - means.item(0)
observed_difference

-9.266142572024918


def difference_of_means(table, numeric_label, category_label):
    """Return the gap between the average values of two groups.

    Args:
        table: Table holding the data (the table itself, not its name).
        numeric_label: Label of the column whose values are averaged.
        category_label: Label of the column that defines the groups.

    Returns:
        The second group's average minus the first group's average, with
        the groups taken in the order the grouped table lists them.
        Only the first two groups are used.
    """
    # Narrow to the two relevant columns, average the numeric column within
    # each category, and pull out the resulting column of averages.
    group_averages = (
        table.select(numeric_label, category_label)
        .group(category_label, np.average)
        .column(1)
    )

    second_average = group_averages.item(1)
    first_average = group_averages.item(0)
    return second_average - first_average


# Sanity check: on the original data the function should reproduce the
# observed_difference computed by hand above.
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

-9.266142572024918


# Small four-row table used to demonstrate how Table.sample behaves.
staff = Table().with_columns(
    'Names', make_array('Jim', 'Pam', 'Dwight', 'Michael'),
    'Ages', make_array(29, 28, 34, 41)
)


# Default sampling: the recorded output repeats a row, i.e. rows are drawn
# with replacement.
staff.sample()


# Sampling all rows without replacement: each row appears exactly once, in a
# random order (a shuffle of the table).
staff.sample(with_replacement = False)


# Attach a shuffled copy of the first column ('Names') next to the original
# rows; the original columns keep their order.
staff.with_column('Shuffled', staff.sample(with_replacement = False).column(0))


# Display the two-column table whose labels are about to be shuffled.
smoking_and_birthweight


# Shuffle the rows (sample without replacement) and keep only the label
# column: a random permutation of the 'Maternal Smoker' labels.
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False).column('Maternal Smoker')


# Attach the permuted labels beside the original, unshuffled rows.
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)


original_and_shuffled


# Difference of group means when rows are grouped by the shuffled labels.
# The value depends on the random shuffle, so it changes from run to run.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

-1.3711504182092398


# Grouping by the original labels still yields the observed difference.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')

-9.266142572024918


def one_simulated_difference(table, numeric_label, category_label):
    """Simulate one difference of group means after shuffling the labels.

    The category labels are randomly permuted across rows, which breaks any
    association between label and numeric value, and the difference of group
    means is then recomputed using the permuted labels.

    Args:
        table: Table holding the data (the table itself, not its name).
        numeric_label: Label of the column whose values are averaged.
        category_label: Label of the column that defines the two groups.

    Returns:
        The difference of group means, as computed by difference_of_means,
        with 'Shuffled Label' (the permuted labels) as the grouping column.
    """
    # Only the label column needs shuffling, so select it before sampling
    # instead of sampling every column of the table and discarding the rest.
    shuffled_labels = (
        table.select(category_label)
        .sample(with_replacement=False)
        .column(0)
    )

    # Numeric values stay in their original order; only the labels move.
    shuffled_table = table.select(numeric_label).with_column(
        'Shuffled Label', shuffled_labels
    )

    return difference_of_means(shuffled_table, numeric_label, 'Shuffled Label')


# One simulated difference; the result depends on the random shuffle, so it
# changes from run to run.
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

0.8431677255206722


# Number of label shuffles used to approximate the distribution of the
# statistic when labels are assigned at random.
num_simulations = 2500

# Collect every simulated statistic, then build the array once. Calling
# np.append inside a loop copies the whole array on each iteration.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in range(num_simulations)
])


# Histogram of the simulated differences, with the observed difference
# printed alongside for comparison.
Table().with_column('Difference Between Group Means', differences).hist()
print('Observed Difference:', observed_difference)
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# echo of the returned title object — confirm.
plots.title('Prediction Under the Null Hypothesis');

Observed Difference: -9.266142572024918

Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
120	284	27	62	100	False
113	282	33	64	135	False
128	279	28	64	115	True
108	282	23	67	125	True
136	286	25	62	93	False
138	244	33	62	178	False
132	245	23	65	140	False
120	289	25	62	125	False
143	299	30	66	136	True
140	351	27	68	120	False

Maternal Smoker	count
False	715
True	459

Maternal Smoker	Birth Weight average
False	123.085
True	113.819

Maternal Smoker	Birth Weight
False	120
False	113
True	128
True	108
False	136
False	138
False	132
False	120
True	143
False	140

Maternal Smoker	Birth Weight	Shuffled Label
False	120	False
False	113	True
True	128	True
True	108	False
False	136	False
False	138	False
False	132	False
False	120	False
True	143	True
False	140	True

Comparing Two Samples

Test Statistic

Random Permutation (Shuffling)

Simulation Under Null Hypothesis

Permutation Test

Names	Ages
Dwight	34
Jim	29
Dwight	34
Michael	41

Names	Ages
Michael	41
Pam	28
Jim	29
Dwight	34

Names	Ages	Shuffled
Jim	29	Michael
Pam	28	Dwight
Dwight	34	Jim
Michael	41	Pam