In [1]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter(action='ignore')

Distribution of the Sample Average

In [2]:
# Load the flight data and plot the distribution of its 'Delay' column.
united = Table.read_table('united.csv')
# Bin edges -20, -10, ..., 290: bins of width 10 (np.arange excludes the stop value 300).
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)
In [3]:
# Treat every row of `united` as the population and compute its mean and SD.
delays = united.column('Delay')
population_mean = np.mean(delays)
population_sd = np.std(delays)  # np.std defaults to ddof=0, i.e. the population SD
# Display both values as the cell's output.
population_mean, population_sd
Out[3]:
(16.658155515370705, 39.480199851609314)
In [4]:
def one_sample_mean(sample_size):
    """Return the mean 'Delay' of one random sample of flights.

    Rows are drawn from the module-level `united` table with `Table.sample`
    (which samples with replacement unless told otherwise), so each call
    yields a different value.

    Parameters
    ----------
    sample_size : int
        Number of rows to draw from `united`.

    Returns
    -------
    float
        Mean of the 'Delay' column over the sampled rows.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)
In [5]:
def ten_thousand_sample_means(sample_size):
    """Approximate the distribution of the sample mean by simulation.

    Calls `one_sample_mean(sample_size)` 10,000 times and collects the
    results. `one_sample_mean` is looked up at call time, so whichever
    definition is current in the notebook is the one that gets simulated.

    Parameters
    ----------
    sample_size : int
        Number of rows drawn for each simulated sample.

    Returns
    -------
    numpy.ndarray
        The 10,000 simulated sample means, in the order they were drawn.
    """
    # Build the array once from a list: np.append copies the entire array on
    # every call, which makes a 10,000-step append loop quadratic.
    return np.array([one_sample_mean(sample_size) for _ in range(10000)])
In [6]:
# Simulate 10,000 means of samples of 400 flights and plot their histogram.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
# Print the population mean for comparison with the histogram.
print('Population Average:', population_mean)
Population Average: 16.6581555154

How many possible ways are there that the sample could have come out?

In [7]:
# Number of rows in the table, i.e. the size of the population being sampled.
united.num_rows
Out[7]:
13825
In [8]:
# How many possible samples of size 400 are there? Each of the 400 draws can be
# any row, so this counts ordered samples drawn with replacement. It is only an
# upper bound on the number of distinct sample means, because different
# samples can share the same mean.
united.num_rows ** 400
Out[8]:
1845253300060122534684058597421182951017338738756884128476156537109395559702295050837795610986469015706417701209704203890696502616877632733167737247128710898267622177602279004150237321497531712957437744194467949953310673085563343687922543234838511385550568262088418334016217887061735745358842456577208939555740404373614633941136938352510678553686140728842897559436843191863609514780562396147921834537075860636817493656816017587537752125750151805566479543166742758254440594398100342650860455445795087942680221047750947255562969782801791748639952850756659518870235020316513575934561122027710440608023538776721761349403194150575014182981349346980577699633375066811153948871815566280319121565243984196831524157664160526536071758298269096548661601001356951548310460921171197940711389849058290284443729287317331793254191518876765178731748453564076631795997590039640387156475975294002056115371653663653810384491550154761367793765502453546587829476276348569091617961438914859251316410793595152416543512156519176222525375686012456475153300949448237257016682314992901731108854714280100481240308330277787527057569352848422077629847824096271549220911775951976626139238541198474345951971545910182363395101363835800498012434345133091636761182032795746446844142448244919867423330981077698924182274597759026544920974656341161972764585310457415429622273840896606709916142504003925039710004668567134806122448204818515414944603415501242828559573349452338412633791865156682427204248905349875523364526656394479383566492244844079971495826454577423053062574743146215624961827622221237394742408158154755584214993725965536145817076603276572256684406880822280072607100009918212890625
In [9]:
# Repeat the simulation with samples of 900 flights.
sample_means_900 = ten_thousand_sample_means(900)
In [10]:
# Collect both simulations in one table, one column per sample size.
means_tbl = Table().with_columns(
    '400', sample_means_400,
    '900', sample_means_900,
)
In [11]:
# Histogram both columns of means_tbl using the same bins of width 0.5 (edges 5 to 30.5).
means_tbl.hist(bins = np.arange(5, 31, 0.5))
# The trailing semicolon suppresses the text repr of the object returned by plots.title.
plots.title('Distribution of Sample Average');

Relationship Between Population SD and Sample Size

In [12]:
def plot_sample_means(sample_size):
    """Show the empirical distribution of random sample means.

    Simulates sample means with `ten_thousand_sample_means(sample_size)`,
    prints the sample size, the population mean and SD, and the mean and SD
    of the simulated sample means, then draws a histogram of the simulated
    means.

    Parameters
    ----------
    sample_size : int
        Number of rows in each simulated sample.

    Returns
    -------
    None
        The results are the printed summary and the histogram.
    """
    sample_means = ten_thousand_sample_means(sample_size)
    sample_means_tbl = Table().with_column('Sample Means', sample_means)

    # Print some information about the distribution of the sample means
    print("Sample size: ", sample_size)
    print("Population mean:", population_mean)
    print("Average of sample means: ", np.mean(sample_means))
    print("Population SD:", population_sd)
    print("SD of sample means:", np.std(sample_means))

    # Plot a histogram of the sample means
    sample_means_tbl.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))
In [13]:
# Summarize and plot simulated means for samples of size 100.
plot_sample_means(100)
Sample size:  100
Population mean: 16.6581555154
Average of sample means:  16.704702
Population SD: 39.4801998516
SD of sample means: 3.98765981388
In [14]:
# Ratio of the population SD (rounded to 39.48) to the SD of the sample means;
# it should be close to sqrt(100) = 10.
# NOTE(review): 3.932 is hard-coded and does not match the SD printed by the
# previous cell (3.98765981388) — presumably copied from an earlier run; the
# simulation is not seeded, so the value changes on every run.
39.48 / 3.932
Out[14]:
10.040691759918616
In [15]:
# Summarize and plot simulated means for samples of size 400.
plot_sample_means(400)
Sample size:  400
Population mean: 16.6581555154
Average of sample means:  16.69335
Population SD: 39.4801998516
SD of sample means: 1.97027797817
In [16]:
# Ratio of the population SD (rounded to 39.48) to the SD of the sample means;
# it should be close to sqrt(400) = 20.
# NOTE(review): 1.973 is hard-coded and does not match the SD printed by the
# previous cell (1.97027797817) — presumably copied from an earlier run; the
# simulation is not seeded, so the value changes on every run.
39.48 / 1.973
Out[16]:
20.010136847440442
In [17]:
# Summarize and plot simulated means for samples of size 625.
plot_sample_means(625)
Sample size:  625
Population mean: 16.6581555154
Average of sample means:  16.64441152
Population SD: 39.4801998516
SD of sample means: 1.56524719602
In [18]:
# Ratio of the population SD (rounded to 39.48) to the SD of the sample means;
# it should be close to sqrt(625) = 25.
# NOTE(review): 1.577 is hard-coded and does not match the SD printed by the
# previous cell (1.56524719602) — presumably copied from an earlier run; the
# simulation is not seeded, so the value changes on every run.
39.48 / 1.577
Out[18]:
25.034876347495242
In [19]:
# Population SD (population_sd rounded to 39.48) divided by sqrt(sample size),
# for comparison with the simulated SD of the sample means at n = 100.
39.48 / np.sqrt(100)
Out[19]:
3.9479999999999995
In [20]:
# Population SD (population_sd rounded to 39.48) divided by sqrt(sample size),
# for comparison with the simulated SD of the sample means at n = 400.
39.48 / np.sqrt(400)
Out[20]:
1.9739999999999998
In [21]:
# Population SD (population_sd rounded to 39.48) divided by sqrt(sample size),
# for comparison with the simulated SD of the sample means at n = 625.
39.48 / np.sqrt(625)
Out[21]:
1.5791999999999999

Variability of the Sample Mean

In [22]:
# Warning: this cell will take a long time to run!
# It runs one full 10,000-sample simulation for every sample size below.
sample_sizes = np.arange(100, 950, 50)

# Collect one SD per sample size in a list, then convert to an array once.
sd_per_size = []
for n in sample_sizes:
    sample_means = ten_thousand_sample_means(n)
    sd_per_size.append(np.std(sample_means))
sample_mean_sds = np.array(sd_per_size)
In [23]:
# One row per sample size: the SD of the simulated sample means next to
# population SD / sqrt(sample size), so the two can be compared.
sd_table = Table().with_columns(
    'Sample size', sample_sizes,
    'SD of simulated sample means', sample_mean_sds,
    'Pop SD / sqrt(sample size)', population_sd / np.sqrt(sample_sizes),
)
# Display the table as the cell's output.
sd_table
Out[23]:
Sample size SD of simulated sample means Pop SD / sqrt(sample size)
100 3.92947 3.94802
150 3.23346 3.22354
200 2.80173 2.79167
250 2.50378 2.49695
300 2.28177 2.27939
350 2.1184 2.11031
400 1.97849 1.97401
450 1.84082 1.86111
500 1.75102 1.76561
550 1.67135 1.68344

... (7 rows omitted)

In [24]:
# Plot both SD columns against 'Sample size'.
# NOTE(review): iscatter is presumably the interactive variant of Table.scatter —
# confirm it exists in the installed datascience version.
sd_table.iscatter('Sample size')

Other Distributions

In [25]:
# Load the salary data and plot the distribution of its 'Salary' column.
sf_salaries = Table.read_table("san_francisco_2019.csv")
sf_salaries.hist("Salary")
In [26]:
def one_sample_mean(sample_size):
    """Return the mean 'Salary' of one random sample of salary rows.

    Rows are drawn from the module-level `sf_salaries` table with
    `Table.sample` (which samples with replacement unless told otherwise).

    Note: this redefines the earlier `one_sample_mean` that sampled flight
    delays from `united`; from this cell on, every caller of the name gets
    salaries.

    Parameters
    ----------
    sample_size : int
        Number of rows to draw from `sf_salaries`.

    Returns
    -------
    float
        Mean of the 'Salary' column over the sampled rows.
    """
    salaries_in_sample = sf_salaries.sample(sample_size).column('Salary')
    return np.mean(salaries_in_sample)
In [27]:
def ten_thousand_sample_means(sample_size):
    """Approximate the distribution of the sample mean by simulation.

    Calls `one_sample_mean(sample_size)` 10,000 times and collects the
    results. `one_sample_mean` is looked up at call time, so this uses
    whichever definition is current in the notebook.

    Parameters
    ----------
    sample_size : int
        Number of rows drawn for each simulated sample.

    Returns
    -------
    numpy.ndarray
        The 10,000 simulated sample means, in the order they were drawn.
    """
    # Build the array once from a list: np.append copies the entire array on
    # every call, which makes a 10,000-step append loop quadratic.
    return np.array([one_sample_mean(sample_size) for _ in range(10000)])
In [28]:
# Simulate 10,000 means of samples of 400 salaries and plot their histogram.
# NOTE(review): this rebinds sample_means_400, which earlier held the
# flight-delay means used to build means_tbl; re-running that earlier cell
# after this one would silently pick up the salary means instead.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 salaries', sample_means_400).hist(bins=20)
# Print the population mean salary for comparison with the histogram.
print('Population Average:', np.mean(sf_salaries.column("Salary")))
Population Average: 80608.7076025
In [ ]:
 
In [ ]: