import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')


# Load the exam scores from CSV into a datascience Table and preview the
# first 5 rows.
exams = Table.read_table('exams_fa18.csv')
exams.show(5)


# Histogram the table's columns on separate axes (overlay=False) using
# 5-point-wide bins with edges at 0, 5, ..., 100.
exams.hist(overlay=False, bins=np.arange(0,101,5))


def standard_units(x):
    """Re-express each value of x as a number of SDs from the mean.

    x is expected to be a NumPy array of numbers. The result has one entry
    per input value, computed as (value - mean) / SD, where the SD comes
    from np.std with its default arguments.
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread


# Convert the Midterm scores to standard units and attach them as a new column.
midterm_su = standard_units(exams.column('Midterm'))
exams = exams.with_column('Midterm in Standard Units', midterm_su)

# Same conversion for the Final scores.
final_su = standard_units(exams.column('Final'))
exams = exams.with_column('Final in Standard Units', final_su)

# Preview the first 10 rows, now including the two standard-unit columns.
exams.show(10)


# Histogram only the two standard-unit columns, on separate axes, with bin
# edges running from -4 up to just under 2 in steps of 0.1.
exams.select(
    'Midterm in Standard Units', 'Final in Standard Units'
).hist(overlay=False, bins=np.arange(-4,2,0.1))


# Load the births data from CSV into a datascience Table.
births = Table.read_table('baby.csv')


# Histogram of 'Maternal Height' with unit-wide bins whose edges sit at
# 56.5, 57.5, ..., 72.5, so each bin is centered on a whole number.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))


# Mean and SD of the maternal heights (np.std with its default arguments).
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

(64.049403747870528, 2.5250254409674375)


# Endpoints of the interval from 1 SD below the mean to 1 SD above the mean
# of the maternal heights.
np.mean(heights) - np.std(heights), np.mean(heights) + np.std(heights)

(61.524378306903088, 66.574429188837968)


# Load the flight data from CSV into a datascience Table and display it.
united = Table.read_table('united.csv')
united


# Histogram of 'Delay' with 10-unit-wide bins whose edges run from -20 to 290.
# NOTE(review): delays outside that range presumably do not appear in the
# plot — confirm against the data.
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)


# Mean and SD of all delays in the table; delay_mean is printed again below
# as the "Population Average".
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
delay_mean, delay_sd

(16.658155515370705, 39.480199851609314)


# 50th percentile (median) of the delays.
percentile(50, delays)

2


def one_sample_mean(sample_size):
    """
    Draw sample_size rows at random from the united flights table
    and return the average of their 'Delay' values.
    """
    sample_delays = united.sample(sample_size).column('Delay')
    return np.mean(sample_delays)


# Mean delay of one random sample of 100 flights; the value varies from run
# to run because the sample is random.
one_sample_mean(100)

20.780000000000001


def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means of flight delays.

    Calls one_sample_mean(sample_size) 10,000 times and returns the
    results, in the order they were drawn, as a NumPy array of length
    10,000.
    """
    repetitions = 10000
    # Collect the means in a list and convert once at the end: np.append
    # copies the whole array on every call, which made the original loop
    # quadratic in the number of repetitions.
    means = [one_sample_mean(sample_size) for _ in range(repetitions)]
    return np.array(means)


# Simulate 10,000 means of random samples of 100 flights each.
sample_means_100 = ten_thousand_sample_means(100)


# Display the simulated means.
sample_means_100

array([ 13.26,  13.2 ,  20.8 , ...,  11.17,  19.01,  16.86])


# Check how many simulated means were collected.
len(sample_means_100)

10000


# Histogram (20 bins) of the simulated means of 100-flight samples.
Table().with_column(
    'Mean of 100 flight delays', sample_means_100).hist(bins=20)

# Print the mean delay of the full table for comparison with the histogram.
print('Population Average:', delay_mean)

Population Average: 16.6581555154


# Repeat the simulation with samples of 400 flights and histogram the
# resulting means (20 bins).
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

# Print the mean delay of the full table for comparison with the histogram.
print('Population Average:', delay_mean)

Population Average: 16.6581555154

Midterm	Final	Midterm in Standard Units	Final in Standard Units
91	88	1.16278	0.717241
89.5	84	1.03032	0.390935
78	71.5	0.0147889	-0.628768
87	88	0.809552	0.717241
72	72.5	-0.515053	-0.547191
71.5	61	-0.559206	-1.48532
82	92.5	0.368017	1.08433
84.5	87	0.588784	0.635664
85	92	0.632938	1.04355
78	71	0.0147889	-0.669556

Date	Flight Number	Destination	Delay
6/1/15	73	HNL	257
6/1/15	217	EWR	28
6/1/15	237	STL	-3
6/1/15	250	SAN	0
6/1/15	267	PHL	64
6/1/15	273	SEA	-6
6/1/15	278	SEA	-8
6/1/15	292	EWR	12
6/1/15	300	HNL	20
6/1/15	317	IND	-10

Standard Units

The SD and Bell Shaped Curves

Central Limit Theorem