import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
From a random sample, construct a 95% confidence interval for the average age of the mothers in the population.
# Original sample: load the table from baby.csv and preview its first 3 rows
births = Table.read_table('baby.csv')
births.show(3)
| Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker | 
|---|---|---|---|---|---|
| 120 | 284 | 27 | 62 | 100 | False | 
| 113 | 282 | 33 | 64 | 135 | False | 
| 128 | 279 | 28 | 64 | 115 | True | 
... (1171 rows omitted)
def one_bootstrap_mean():
    """Return the average 'Maternal Age' of one resample of `births`.

    The resample is whatever `births.sample()` produces when called with
    its default arguments.
    """
    resampled_ages = births.sample().column('Maternal Age')
    return np.mean(resampled_ages)
# Generate means from 3000 bootstrap samples.
# Collect the means in a list and convert once: calling np.append inside the
# loop would copy the whole array on every iteration.
num_repetitions = 3000
bstrap_means = np.array(
    [one_bootstrap_mean() for _ in range(num_repetitions)]
)
# Endpoints of the 95% confidence interval: the 2.5th and 97.5th
# percentiles of the bootstrap means
left, right = (percentile(rank, bstrap_means) for rank in (2.5, 97.5))
print(f"Left End: {left}")
print(f"Right End: {right}")
Left End: 26.906303236797275 Right End: 27.579216354344123
# Histogram of the bootstrap means, with the 95% interval drawn as a
# yellow bar along the horizontal axis
resampled_means = Table().with_column('Bootstrap Sample Mean', bstrap_means)
resampled_means.hist(bins=15)
interval_ends = [left, right]
plots.plot(interval_ends, [0, 0], color='yellow', lw=8, label='95% CI')
plots.legend();
# Size, average and SD of the maternal ages in the original sample
sampled_ages = births.column('Maternal Age')
sample_size = len(sampled_ages)
sample_average = sampled_ages.mean()
sample_SD = sampled_ages.std()
print(f"Sample Size: {sample_size}")
print(f"Sample Average: {sample_average}")
print(f"Sample SD: {sample_SD}")
Sample Size: 1174 Sample Average: 27.228279386712096 Sample SD: 5.815360404190897
We need to add and subtract $2 \cdot \frac{\text{Population SD}}{\sqrt{\text{Sample Size}}}$ from the sample average, but we don't have the population SD.
# Try estimating it from the sample: use the sample SD in place of the
# unknown population SD and divide by the square root of the sample size
estimated_SD_of_sample_average = sample_SD / (sample_size**0.5)
estimated_SD_of_sample_average
0.16972373742299796
# Approximate 95% confidence interval for population mean:
# sample average plus or minus 2 estimated SDs of the sample average
margin = 2*estimated_SD_of_sample_average
(sample_average - margin, sample_average + margin)
(26.888831911866099, 27.567726861558093)
# Population of size 10 made of ones followed by zeros
number_of_ones = 4
number_of_zeros = 10 - number_of_ones
zero_one_population = np.concatenate(
    [np.ones(number_of_ones), np.zeros(number_of_zeros)]
)
print(f"Population: {zero_one_population}")
print(f"Standard Deviation: {np.round(np.std(zero_one_population),2)}")
Population: [ 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.] Standard Deviation: 0.49
def sd_of_zero_one_population(number_of_ones, population_size=10):
    """Return the SD of a population made up only of ones and zeros.

    Args:
        number_of_ones: How many elements have the value 1.
        population_size: Total number of elements; the remaining
            (population_size - number_of_ones) elements have the value 0.
            Defaults to 10.

    Returns:
        The standard deviation of the population, as computed by np.std.

    Raises:
        ValueError: If population_size is less than 1, or number_of_ones
            is negative or greater than population_size.
    """
    if population_size < 1:
        raise ValueError("population_size must be at least 1")
    if not 0 <= number_of_ones <= population_size:
        raise ValueError(
            "number_of_ones must be between 0 and population_size"
        )
    zero_one_population = np.append(
        np.ones(number_of_ones),
        np.zeros(population_size - number_of_ones),
    )
    return np.std(zero_one_population)
# Every possible count of ones in a population of 10 (0 through 10),
# alongside the corresponding proportion of ones
possible_ones = np.arange(11)
proportion_of_ones = possible_ones / 10
zero_one_pop = Table().with_columns(
    'Number of Ones', possible_ones,
    'Proportion of Ones', proportion_of_ones,
)
zero_one_pop.show()
| Number of Ones | Proportion of Ones | 
|---|---|
| 0 | 0 | 
| 1 | 0.1 | 
| 2 | 0.2 | 
| 3 | 0.3 | 
| 4 | 0.4 | 
| 5 | 0.5 | 
| 6 | 0.6 | 
| 7 | 0.7 | 
| 8 | 0.8 | 
| 9 | 0.9 | 
| 10 | 1 | 
# Apply sd_of_zero_one_population to the 'Number of Ones' column and
# attach the results to the table as a new 'Pop SD' column
sds = zero_one_pop.apply(sd_of_zero_one_population, 'Number of Ones')
zero_one_pop = zero_one_pop.with_column('Pop SD', sds)
zero_one_pop.show()
| Number of Ones | Proportion of Ones | Pop SD | 
|---|---|---|
| 0 | 0 | 0 | 
| 1 | 0.1 | 0.3 | 
| 2 | 0.2 | 0.4 | 
| 3 | 0.3 | 0.458258 | 
| 4 | 0.4 | 0.489898 | 
| 5 | 0.5 | 0.5 | 
| 6 | 0.6 | 0.489898 | 
| 7 | 0.7 | 0.458258 | 
| 8 | 0.8 | 0.4 | 
| 9 | 0.9 | 0.3 | 
| 10 | 1 | 0 | 
# Scatter the population SD against the proportion of ones
zero_one_pop.iscatter('Proportion of Ones', 'Pop SD')