from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")
# Load the births data set from baby.csv into a Table.
births = Table.read_table('baby.csv')
# A bare name on the last line of a cell displays the table.
births
| Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
|---|---|---|---|---|---|
| 120 | 284 | 27 | 62 | 100 | False |
| 113 | 282 | 33 | 64 | 135 | False |
| 128 | 279 | 28 | 64 | 115 | True |
| 108 | 282 | 23 | 67 | 125 | True |
| 136 | 286 | 25 | 62 | 93 | False |
| 138 | 244 | 33 | 62 | 178 | False |
| 132 | 245 | 23 | 65 | 140 | False |
| 120 | 289 | 25 | 62 | 125 | False |
| 143 | 299 | 30 | 66 | 136 | True |
| 140 | 351 | 27 | 68 | 120 | False |
... (1164 rows omitted)
# Keep only the two columns needed to compare birth weight by smoking status.
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')
# Count the rows in each 'Maternal Smoker' category.
smoking_and_birthweight.group('Maternal Smoker')
| Maternal Smoker | count |
|---|---|
| False | 715 |
| True | 459 |
# Histogram of 'Birth Weight', grouped by the 'Maternal Smoker' category.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')
[Question] What values of our statistic are in favor of the alternative: positive or negative?
# Average birth weight within each 'Maternal Smoker' category.
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table
| Maternal Smoker | Birth Weight average |
|---|---|
| False | 123.085 |
| True | 113.819 |
# Array of the two group averages (column 1 of means_table).
means = means_table.column(1)
# Row 1 is the True (smoker) group and row 0 is the False group, so the
# statistic is the smokers' mean minus the non-smokers' mean.
observed_difference = means.item(1) - means.item(0)
observed_difference
-9.266142572024918
def difference_of_means(table, numeric_label, category_label):
    """Return the difference between the mean values of two groups.

    Takes:
      - table: the Table holding the data (the table itself, not its name)
      - numeric_label: column label of the numerical variable
      - category_label: column label of the categorical variable, which
        must split the rows into exactly two groups

    Returns: mean of the second group minus mean of the first group, in the
    row order produced by ``group``. For the boolean 'Maternal Smoker'
    labels used in this notebook that is the True group's mean minus the
    False group's mean.

    Raises: ValueError if the categorical column does not produce exactly
    two groups, since the difference would otherwise be undefined or would
    silently ignore the extra groups.
    """
    # table with the two relevant columns
    reduced = table.select(numeric_label, category_label)
    # table containing group means; the grouping column comes first, so the
    # averages of the numerical variable are in column 1
    means_table = reduced.group(category_label, np.average)
    # array of group means
    means = means_table.column(1)
    if len(means) != 2:
        raise ValueError(
            'difference_of_means needs exactly two groups in column '
            + repr(category_label) + ', found ' + str(len(means))
        )
    return means.item(1) - means.item(0)
# Check: the function reproduces the observed difference computed by hand above.
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')
-9.266142572024918
# A small example table used to demonstrate row sampling.
staff = Table().with_columns(
'Names', make_array('Jim', 'Pam', 'Dwight', 'Michael'),
'Ages', make_array(29, 28, 34, 41)
)
# Called with no arguments, sample can repeat rows (see the duplicated row in
# the output), i.e. it samples with replacement.
staff.sample()
| Names | Ages |
|---|---|
| Dwight | 34 |
| Jim | 29 |
| Dwight | 34 |
| Michael | 41 |
# Sampling without replacement returns every row exactly once, in shuffled order.
staff.sample(with_replacement = False)
| Names | Ages |
|---|---|
| Michael | 41 |
| Pam | 28 |
| Jim | 29 |
| Dwight | 34 |
# Attach the names from a shuffled copy of the table as a new column, so each
# original row is paired with a randomly reassigned name.
staff.with_column('Shuffled', staff.sample(with_replacement = False).column(0))
| Names | Ages | Shuffled |
|---|---|---|
| Jim | 29 | Michael |
| Pam | 28 | Dwight |
| Dwight | 34 | Jim |
| Michael | 41 | Pam |
# Display the two-column table of smoking status and birth weight again.
smoking_and_birthweight
| Maternal Smoker | Birth Weight |
|---|---|
| False | 120 |
| False | 113 |
| True | 128 |
| True | 108 |
| False | 136 |
| False | 138 |
| False | 132 |
| False | 120 |
| True | 143 |
| False | 140 |
... (1164 rows omitted)
# Shuffle the rows without replacement and keep only the label column, which
# gives a random permutation of the 'Maternal Smoker' labels.
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False).column('Maternal Smoker')
# Place the permuted labels alongside the original columns.
original_and_shuffled = smoking_and_birthweight.with_column(
'Shuffled Label', shuffled_labels
)
original_and_shuffled
| Maternal Smoker | Birth Weight | Shuffled Label |
|---|---|---|
| False | 120 | False |
| False | 113 | True |
| True | 128 | True |
| True | 108 | False |
| False | 136 | False |
| False | 138 | False |
| False | 132 | False |
| False | 120 | False |
| True | 143 | True |
| False | 140 | True |
... (1164 rows omitted)
# Difference of group means when the rows are grouped by the shuffled labels.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')
-1.3711504182092398
# Difference of group means using the original labels (the observed statistic).
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')
-9.266142572024918
def one_simulated_difference(table, numeric_label, category_label):
    """Simulate one value of the statistic with randomly shuffled labels.

    Takes:
      - table: the Table holding the data
      - numeric_label: column label of the numerical variable
      - category_label: column label of the categorical variable

    Returns: the difference of group means (as computed by
    difference_of_means) after the category labels have been randomly
    permuted among the rows.
    """
    # shuffle every row without replacement, then keep only the labels
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(category_label)
    # numerical values in their original order, paired with permuted labels
    numeric_only = table.select(numeric_label)
    relabeled = numeric_only.with_column('Shuffled Label', permuted_labels)
    return difference_of_means(relabeled, numeric_label, 'Shuffled Label')
# One simulated value of the statistic; it varies from run to run because the
# labels are shuffled at random.
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
0.8431677255206722
# Number of label shuffles used to approximate the distribution of the
# statistic under the null hypothesis.
repetitions = 2500

# Collect the simulated statistics in a list and convert to an array once.
# Calling np.append inside the loop copies the whole array on every
# iteration, which makes the simulation quadratic in the number of repetitions.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in range(repetitions)
])
# Histogram of the simulated differences, shown with the observed value.
null_distribution = Table().with_column('Difference Between Group Means', differences)
null_distribution.hist()
print('Observed Difference:', observed_difference)
# The trailing semicolon suppresses the notebook's echo of the return value.
plots.title('Prediction Under the Null Hypothesis');
Observed Difference: -9.266142572024918