import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter(action='ignore')


# Load the flight data and plot a histogram of its 'Delay' column.
united = Table.read_table('united.csv')
# Bin edges run from -20 to 290 in steps of 10 (np.arange excludes the stop value 300).
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)


# Treat every row of `united` as the population and compute its mean and SD.
delays = united.column('Delay')
population_mean = np.mean(delays)
# np.std defaults to ddof=0, i.e. it divides by N (population SD), not N - 1.
population_sd = np.std(delays)
# Bare expression: displayed as the cell's output when run in a notebook.
population_mean, population_sd

(16.658155515370705, 39.480199851609314)


def one_sample_mean(sample_size):
    """Return the mean 'Delay' of one random sample of rows from `united`.

    The sample is drawn with `united.sample(sample_size)` using that
    method's default settings.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)


def ten_thousand_sample_means(sample_size):
    """Approximate the distribution of the sample mean by simulation.

    Calls `one_sample_mean(sample_size)` 10,000 times and collects the
    results.

    Args:
        sample_size: Passed unchanged to `one_sample_mean` on every repetition.

    Returns:
        A NumPy array of the 10,000 simulated sample means, in the order
        they were generated.
    """
    # Collect into a list and convert once: np.append returns a fresh copy
    # of the whole array on every call, which made the original loop
    # quadratic in the number of repetitions.
    means = [one_sample_mean(sample_size) for _ in range(10000)]
    return np.array(means)


# Simulate 10,000 means of samples of size 400 and plot their empirical
# distribution, then print the population mean for comparison.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
print('Population Average:', population_mean)

Population Average: 16.6581555154


united.num_rows

13825


# How many possible sample means are there?
# num_rows ** 400 counts ordered samples of size 400 drawn with replacement
# (assuming the default behaviour of Table.sample — TODO confirm). That is an
# upper bound on the number of distinct sample means, because different
# samples can share the same mean.
united.num_rows ** 400

1845253300060122534684058597421182951017338738756884128476156537109395559702295050837795610986469015706417701209704203890696502616877632733167737247128710898267622177602279004150237321497531712957437744194467949953310673085563343687922543234838511385550568262088418334016217887061735745358842456577208939555740404373614633941136938352510678553686140728842897559436843191863609514780562396147921834537075860636817493656816017587537752125750151805566479543166742758254440594398100342650860455445795087942680221047750947255562969782801791748639952850756659518870235020316513575934561122027710440608023538776721761349403194150575014182981349346980577699633375066811153948871815566280319121565243984196831524157664160526536071758298269096548661601001356951548310460921171197940711389849058290284443729287317331793254191518876765178731748453564076631795997590039640387156475975294002056115371653663653810384491550154761367793765502453546587829476276348569091617961438914859251316410793595152416543512156519176222525375686012456475153300949448237257016682314992901731108854714280100481240308330277787527057569352848422077629847824096271549220911775951976626139238541198474345951971545910182363395101363835800498012434345133091636761182032795746446844142448244919867423330981077698924182274597759026544920974656341161972764585310457415429622273840896606709916142504003925039710004668567134806122448204818515414944603415501242828559573349452338412633791865156682427204248905349875523364526656394479383566492244844079971495826454577423053062574743146215624961827622221237394742408158154755584214993725965536145817076603276572256684406880822280072607100009918212890625


# Repeat the simulation with a larger sample size (900) for comparison.
sample_means_900 = ten_thousand_sample_means(900)


# Put both sets of simulated means in one table, one column per sample size.
means_tbl = Table().with_columns(
    '400', sample_means_400,
    '900', sample_means_900,
)


# Histogram both columns on shared bin edges 5, 5.5, ..., 30.5.
means_tbl.hist(bins = np.arange(5, 31, 0.5))
# Trailing semicolon suppresses the notebook echo of the return value.
plots.title('Distribution of Sample Average');


"""Empirical distribution of random sample means"""
def plot_sample_means(sample_size):
    """Simulate sample means for one sample size, then summarize and plot them.

    Calls `ten_thousand_sample_means(sample_size)`, prints the sample size,
    the population mean and SD (the module-level `population_mean` and
    `population_sd`), and the mean and SD of the simulated sample means,
    then draws a 20-bin histogram of the simulated means.

    Args:
        sample_size: Number of rows drawn for each simulated sample.

    Returns:
        None. The function is used for its printed output and its plot.
    """
    sample_means = ten_thousand_sample_means(sample_size)
    sample_means_tbl = Table().with_column('Sample Means', sample_means)

    # Print some information about the distribution of the sample means
    print("Sample size: ", sample_size)
    print("Population mean:", population_mean)
    print("Average of sample means: ", np.mean(sample_means))
    print("Population SD:", population_sd)
    print("SD of sample means:", np.std(sample_means))

    # Plot a histogram of the sample means
    sample_means_tbl.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))


plot_sample_means(100)

Sample size:  100
Population mean: 16.6581555154
Average of sample means:  16.704702
Population SD: 39.4801998516
SD of sample means: 3.98765981388


# Ratio of the population SD to the SD of the sample means for sample size 100.
# NOTE(review): 3.932 is hard-coded and differs from the SD printed by the
# preceding cell (3.98765981388) — presumably copied from a different random run.
39.48 / 3.932

10.040691759918616


plot_sample_means(400)

Sample size:  400
Population mean: 16.6581555154
Average of sample means:  16.69335
Population SD: 39.4801998516
SD of sample means: 1.97027797817


# Ratio of the population SD to the SD of the sample means for sample size 400.
# NOTE(review): 1.973 is hard-coded and differs from the SD printed by the
# preceding cell (1.97027797817) — presumably copied from a different random run.
39.48 / 1.973

20.010136847440442


plot_sample_means(625)

Sample size:  625
Population mean: 16.6581555154
Average of sample means:  16.64441152
Population SD: 39.4801998516
SD of sample means: 1.56524719602


# Ratio of the population SD to the SD of the sample means for sample size 625.
# NOTE(review): 1.577 is hard-coded and differs from the SD printed by the
# preceding cell (1.56524719602) — presumably copied from a different random run.
39.48 / 1.577

25.034876347495242


39.48 / np.sqrt(100)

3.9479999999999995


39.48 / np.sqrt(400)

1.9739999999999998


39.48 / np.sqrt(625)

1.5791999999999999


# Warning: this cell will take a long time to run!
# It runs the full 10,000-repetition simulation once per sample size.
sample_sizes = np.arange(100, 950, 50)  # 100, 150, ..., 900

# For each sample size, record the SD of its simulated sample means.
sample_mean_sds = make_array()
for n in sample_sizes:
    sample_means = ten_thousand_sample_means(n)
    sample_mean_sds = np.append(sample_mean_sds, np.std(sample_means))


# Tabulate, per sample size, the simulated SD of the sample means next to
# the value population_sd / sqrt(sample size) for comparison.
sd_table = Table().with_columns(
    'Sample size', sample_sizes,
    'SD of simulated sample means', sample_mean_sds,
    'Pop SD / sqrt(sample size)', population_sd / np.sqrt(sample_sizes),
)
# Bare expression: displays the table when run in a notebook.
sd_table


# Scatter plot with 'Sample size' on the horizontal axis.
sd_table.iscatter('Sample size')


# Switch to a second population and plot a histogram of its 'Salary' column.
# Presumably San Francisco salary data for 2019, judging by the file name — TODO confirm.
sf_salaries = Table.read_table("san_francisco_2019.csv")
sf_salaries.hist("Salary")


def one_sample_mean(sample_size):
    """Take a sample from the population of salaries and compute its mean.

    Note: this rebinds the name `one_sample_mean`, replacing the earlier
    definition that sampled 'Delay' values from `united`. Any function that
    looks the name up at call time will sample salaries from here on.

    Args:
        sample_size: Number of rows to draw from `sf_salaries`.

    Returns:
        The mean of the 'Salary' column over the sampled rows.
    """
    sampled_salaries = sf_salaries.sample(sample_size)
    return np.mean(sampled_salaries.column('Salary'))


def ten_thousand_sample_means(sample_size):
    """Approximate the distribution of the sample mean by simulation.

    Calls `one_sample_mean(sample_size)` 10,000 times and collects the
    results. `one_sample_mean` is resolved when this function runs, so the
    population being sampled is whichever definition is current at that time.

    Args:
        sample_size: Passed unchanged to `one_sample_mean` on every repetition.

    Returns:
        A NumPy array of the 10,000 simulated sample means, in the order
        they were generated.
    """
    # Collect into a list and convert once: np.append returns a fresh copy
    # of the whole array on every call, which made the original loop
    # quadratic in the number of repetitions.
    means = [one_sample_mean(sample_size) for _ in range(10000)]
    return np.array(means)


# Simulate 10,000 means of samples of 400 salaries and plot their distribution.
# Note: this rebinds sample_means_400, discarding the flight-delay means
# stored under the same name earlier in the file.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 salaries', sample_means_400).hist(bins=20)
print('Population Average:', np.mean(sf_salaries.column("Salary")))

Population Average: 80608.7076025

Sample size	SD of simulated sample means	Pop SD / sqrt(sample size)
100	3.92947	3.94802
150	3.23346	3.22354
200	2.80173	2.79167
250	2.50378	2.49695
300	2.28177	2.27939
350	2.1184	2.11031
400	1.97849	1.97401
450	1.84082	1.86111
500	1.75102	1.76561
550	1.67135	1.68344

Distribution of the Sample Average

Relationship Between Population SD and Sample Size

Variability of the Sample Mean

Other distributions