import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action='ignore')
# Load the flight table from united.csv.
united = Table.read_table('united.csv')
# Histogram bin edges: width 10, running from -20 up to 290.
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)
# Treat every value in the 'Delay' column as the population.
delays = united.column('Delay')
population_mean = np.mean(delays)
# np.std defaults to ddof=0, i.e. the population standard deviation.
population_sd = np.std(delays)
# Trailing expression: displayed as the cell output in a notebook.
population_mean, population_sd
(16.658155515370705, 39.480199851609314)
def one_sample_mean(sample_size):
    """Return the mean 'Delay' of one random sample of flights.

    Draws `sample_size` rows from the `united` table with `Table.sample`
    (using that method's default sampling behaviour) and averages the
    'Delay' values of the drawn rows.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)
def ten_thousand_sample_means(sample_size):
    """Approximate the distribution of the sample mean.

    Calls `one_sample_mean(sample_size)` 10,000 times and collects the
    results.

    Args:
        sample_size: number of flights in each simulated sample.

    Returns:
        A float NumPy array holding the 10,000 simulated sample means, in
        the order they were drawn.
    """
    # Build the array in one step. np.append returns a fresh copy of the
    # whole array on every call, so growing the result inside the loop did
    # quadratic work for no benefit.
    return np.array(
        [one_sample_mean(sample_size) for _ in range(10000)],
        dtype=float,
    )
# Simulate 10,000 means of samples of 400 flights and plot their histogram.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
# Print the population mean for comparison with the histogram above.
print('Population Average:', population_mean)
Population Average: 16.6581555154
How many different ways could the sample have come out?
# Number of rows in the united table, i.e. the size of the population.
united.num_rows
13825
# How many possible samples of 400 flights are there?
# Each of the 400 draws can be any row, so this counts ordered samples drawn
# with replacement. It is an upper bound on the number of distinct sample
# means, since different samples can share the same mean.
united.num_rows ** 400
1845253300060122534684058597421182951017338738756884128476156537109395559702295050837795610986469015706417701209704203890696502616877632733167737247128710898267622177602279004150237321497531712957437744194467949953310673085563343687922543234838511385550568262088418334016217887061735745358842456577208939555740404373614633941136938352510678553686140728842897559436843191863609514780562396147921834537075860636817493656816017587537752125750151805566479543166742758254440594398100342650860455445795087942680221047750947255562969782801791748639952850756659518870235020316513575934561122027710440608023538776721761349403194150575014182981349346980577699633375066811153948871815566280319121565243984196831524157664160526536071758298269096548661601001356951548310460921171197940711389849058290284443729287317331793254191518876765178731748453564076631795997590039640387156475975294002056115371653663653810384491550154761367793765502453546587829476276348569091617961438914859251316410793595152416543512156519176222525375686012456475153300949448237257016682314992901731108854714280100481240308330277787527057569352848422077629847824096271549220911775951976626139238541198474345951971545910182363395101363835800498012434345133091636761182032795746446844142448244919867423330981077698924182274597759026544920974656341161972764585310457415429622273840896606709916142504003925039710004668567134806122448204818515414944603415501242828559573349452338412633791865156682427204248905349875523364526656394479383566492244844079971495826454577423053062574743146215624961827622221237394742408158154755584214993725965536145817076603276572256684406880822280072607100009918212890625
# Repeat the simulation with samples of 900 flights.
sample_means_900 = ten_thousand_sample_means(900)

# One column per sample size, so a single hist call draws both distributions.
means_tbl = (
    Table()
    .with_column('400', sample_means_400)
    .with_column('900', sample_means_900)
)
means_tbl.hist(bins=np.arange(5, 31, 0.5))
# The trailing semicolon suppresses the notebook's echo of the title object.
plots.title('Distribution of Sample Average');
def plot_sample_means(sample_size):
    """Show the empirical distribution of random sample means.

    Simulates sample means with `ten_thousand_sample_means(sample_size)`,
    prints their average and SD alongside the population mean and SD, and
    draws a histogram of the simulated means.

    Args:
        sample_size: number of flights in each simulated sample.

    Returns:
        None. The function only prints and plots.
    """
    sample_means = ten_thousand_sample_means(sample_size)
    sample_means_tbl = Table().with_column('Sample Means', sample_means)

    # Print some information about the distribution of the sample means
    print("Sample size: ", sample_size)
    print("Population mean:", population_mean)
    print("Average of sample means: ", np.mean(sample_means))
    print("Population SD:", population_sd)
    print("SD of sample means:", np.std(sample_means))

    # Plot a histogram of the sample means
    sample_means_tbl.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))
# Simulate and plot sample means for samples of 100 flights.
plot_sample_means(100)
Sample size: 100 Population mean: 16.6581555154 Average of sample means: 16.704702 Population SD: 39.4801998516 SD of sample means: 3.98765981388
# Population SD (rounded to 39.48) divided by an SD of sample means for
# sample size 100; the ratio comes out close to sqrt(100) = 10.
# NOTE(review): 3.932 looks hand-copied from an earlier run. The SD printed
# by plot_sample_means changes from run to run.
39.48 / 3.932
10.040691759918616
# Simulate and plot sample means for samples of 400 flights.
plot_sample_means(400)
Sample size: 400 Population mean: 16.6581555154 Average of sample means: 16.69335 Population SD: 39.4801998516 SD of sample means: 1.97027797817
# Population SD (rounded to 39.48) divided by an SD of sample means for
# sample size 400; the ratio comes out close to sqrt(400) = 20.
# NOTE(review): 1.973 looks hand-copied from an earlier run. The SD printed
# by plot_sample_means changes from run to run.
39.48 / 1.973
20.010136847440442
# Simulate and plot sample means for samples of 625 flights.
plot_sample_means(625)
Sample size: 625 Population mean: 16.6581555154 Average of sample means: 16.64441152 Population SD: 39.4801998516 SD of sample means: 1.56524719602
# Population SD (rounded to 39.48) divided by an SD of sample means for
# sample size 625; the ratio comes out close to sqrt(625) = 25.
# NOTE(review): 1.577 looks hand-copied from an earlier run. The SD printed
# by plot_sample_means changes from run to run.
39.48 / 1.577
25.034876347495242
# Population SD (rounded to 39.48) divided by sqrt(sample size), for
# samples of 100; compare with the simulated SD of sample means.
39.48 / np.sqrt(100)
3.9479999999999995
# Population SD (rounded to 39.48) divided by sqrt(sample size), for
# samples of 400; compare with the simulated SD of sample means.
39.48 / np.sqrt(400)
1.9739999999999998
# Population SD (rounded to 39.48) divided by sqrt(sample size), for
# samples of 625; compare with the simulated SD of sample means.
39.48 / np.sqrt(625)
1.5791999999999999
# Warning: this cell will take a long time to run!
sample_sizes = np.arange(100, 950, 50)

# Collect the SD of the simulated sample means for each sample size, then
# convert to an array once at the end.
sd_per_size = []
for n in sample_sizes:
    sample_means = ten_thousand_sample_means(n)
    sd_per_size.append(np.std(sample_means))
sample_mean_sds = np.array(sd_per_size, dtype=float)

# Simulated SDs side by side with population SD / sqrt(sample size).
sd_table = Table().with_columns(
    'Sample size', sample_sizes,
    'SD of simulated sample means', sample_mean_sds,
    'Pop SD / sqrt(sample size)', population_sd / np.sqrt(sample_sizes),
)
sd_table
Sample size | SD of simulated sample means | Pop SD / sqrt(sample size) |
---|---|---|
100 | 3.92947 | 3.94802 |
150 | 3.23346 | 3.22354 |
200 | 2.80173 | 2.79167 |
250 | 2.50378 | 2.49695 |
300 | 2.28177 | 2.27939 |
350 | 2.1184 | 2.11031 |
400 | 1.97849 | 1.97401 |
450 | 1.84082 | 1.86111 |
500 | 1.75102 | 1.76561 |
550 | 1.67135 | 1.68344 |
... (7 rows omitted)
# Plot the other sd_table columns against 'Sample size'.
# NOTE(review): iscatter is presumably the interactive variant of
# Table.scatter; confirm against the datascience docs.
sd_table.iscatter('Sample size')