import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Use the 'fivethirtyeight' plot style for the figures drawn below.
plots.style.use('fivethirtyeight')
# Read the exam scores from CSV into a Table and preview the first 5 rows.
exams = Table.read_table('exams_fa18.csv')
exams.show(5)
Midterm | Final |
---|---|
91 | 88 |
89.5 | 84 |
78 | 71.5 |
87 | 88 |
72 | 72.5 |
... (1237 rows omitted)
# Histogram the table's columns without overlaying them (overlay=False),
# using bin edges 0, 5, 10, ..., 100.
exams.hist(overlay=False, bins=np.arange(0,101,5))
def standard_units(x):
    """Rescale the values in ``x`` to standard units.

    Each value is expressed as its distance from the average of ``x``,
    measured in multiples of the standard deviation of ``x`` (``np.std``
    with its default arguments).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread
# Convert each exam's scores to standard units and attach the results to
# the table as two new columns.
midterm_su = standard_units(exams.column('Midterm'))
exams = exams.with_column('Midterm in Standard Units', midterm_su)
final_su = standard_units(exams.column('Final'))
exams = exams.with_column('Final in Standard Units', final_su)
# Preview the first 10 rows, now including the standard-unit columns.
exams.show(10)
Midterm | Final | Midterm in Standard Units | Final in Standard Units |
---|---|---|---|
91 | 88 | 1.16278 | 0.717241 |
89.5 | 84 | 1.03032 | 0.390935 |
78 | 71.5 | 0.0147889 | -0.628768 |
87 | 88 | 0.809552 | 0.717241 |
72 | 72.5 | -0.515053 | -0.547191 |
71.5 | 61 | -0.559206 | -1.48532 |
82 | 92.5 | 0.368017 | 1.08433 |
84.5 | 87 | 0.588784 | 0.635664 |
85 | 92 | 0.632938 | 1.04355 |
78 | 71 | 0.0147889 | -0.669556 |
... (1232 rows omitted)
# Histogram only the two standard-unit columns, not overlaid.
# Bin edges start at -4 and advance in steps of 0.1, stopping short of 2.
exams.select(
'Midterm in Standard Units', 'Final in Standard Units'
).hist(overlay=False, bins=np.arange(-4,2,0.1))
# Read the births data and histogram the 'Maternal Height' column with
# unit-wide bins whose edges fall on half-values (56.5, 57.5, ..., 72.5).
births = Table.read_table('baby.csv')
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
# Extract the heights as an array and report their mean and SD.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)
(64.049403747870528, 2.5250254409674375)
# Endpoints of the interval from 1 SD below the mean to 1 SD above the mean
np.mean(heights) - np.std(heights), np.mean(heights) + np.std(heights)
(61.524378306903088, 66.574429188837968)
# Read the flight data from CSV into a Table and display it.
united = Table.read_table('united.csv')
united
Date | Flight Number | Destination | Delay |
---|---|---|---|
6/1/15 | 73 | HNL | 257 |
6/1/15 | 217 | EWR | 28 |
6/1/15 | 237 | STL | -3 |
6/1/15 | 250 | SAN | 0 |
6/1/15 | 267 | PHL | 64 |
6/1/15 | 273 | SEA | -6 |
6/1/15 | 278 | SEA | -8 |
6/1/15 | 292 | EWR | 12 |
6/1/15 | 300 | HNL | 20 |
6/1/15 | 317 | IND | -10 |
... (13815 rows omitted)
# Bin edges for the delay histogram: -20, -10, ..., 290 (width 10).
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)
# Mean and SD of the 'Delay' column over every row in the table.
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
delay_mean, delay_sd
(16.658155515370705, 39.480199851609314)
# 50th percentile (the median) of the delays.
percentile(50, delays)
2
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Draws ``sample_size`` rows from the ``united`` table via
    ``Table.sample`` (called with its default settings) and averages
    the 'Delay' column of the rows drawn.
    """
    sample_delays = united.sample(sample_size).column('Delay')
    return np.mean(sample_delays)
# Single trial: the mean delay of one random sample of 100 flights.
one_sample_mean(100)
20.780000000000001
def ten_thousand_sample_means(sample_size, repetitions=10000):
    """Simulate the sample mean repeatedly and collect the results.

    Parameters:
        sample_size: number of flights in each sample; passed straight
            through to ``one_sample_mean``.
        repetitions: how many sample means to simulate. Defaults to
            10000, which matches the function's name and its original
            fixed behavior.

    Returns:
        A 1-D float array of length ``repetitions`` holding one
        simulated sample mean per entry, in the order generated.
    """
    # Gather the means in a list and convert once at the end. Growing an
    # array with np.append copies the entire array on every iteration,
    # which makes the loop quadratic in the number of repetitions.
    means = [one_sample_mean(sample_size) for _ in range(repetitions)]
    return np.array(means, dtype=float)
# Simulate the sample means using samples of 100 flights each, then
# display the resulting array.
sample_means_100 = ten_thousand_sample_means(100)
sample_means_100
array([ 13.26, 13.2 , 20.8 , ..., 11.17, 19.01, 16.86])
# Check how many simulated means were collected.
len(sample_means_100)
10000
# Histogram (20 bins) of the simulated means for samples of size 100,
# followed by the population mean for comparison.
Table().with_column(
'Mean of 100 flight delays', sample_means_100).hist(bins=20)
print('Population Average:', delay_mean)
Population Average: 16.6581555154
# Repeat the simulation with samples of 400 flights and histogram the
# resulting means (20 bins), again printing the population mean.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column(
'Mean of 400 flight delays', sample_means_400).hist(bins=20)
print('Population Average:', delay_mean)
Population Average: 16.6581555154