from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


prizes = make_array("goat", "goat", "car")


N = 10_000
outcomes = Table().with_column("My Choice", np.random.choice(prizes, N))
outcomes


outcomes = outcomes.with_column("Monty's Door", "goat")
outcomes


def other_door(my_choice):
    if my_choice == "car":
        return "goat"
    else:
        return "car"


outcomes = outcomes.with_column(
    "Other Door", outcomes.apply(other_door, "My Choice"))
outcomes


outcomes.group("My Choice").barh("My Choice")


outcomes.group("Other Door").barh("Other Door")


united = Table.read_table('united.csv')
united = ( # Adding row numbers so we can see samples more easily
    united
    .with_column('Row', np.arange(united.num_rows))
    .move_to_start('Row') 
)
united


united.where('Destination', 'JFK')


united.sample(3, with_replacement=True)


(
    united
    .where('Destination', 'JFK')
    .sample(3, with_replacement=True)
)


start = np.random.choice(np.arange(1000))
systematic_sample = united.take(np.arange(start, united.num_rows, 5000))
systematic_sample.show()


die = Table().with_column('Face', np.arange(1, 7))
die


roll_bins = np.arange(0.5, 6.6, 1)
die.hist(bins=roll_bins)


die.sample(3)


die.sample(10).hist(bins=roll_bins)


die.sample(100).hist(bins=roll_bins)


die.sample(100_000).hist(bins=roll_bins)


united.num_rows

13825


united.hist('Delay', bins = 50)


united.sort('Delay', descending=True)


united_bins = np.arange(-20, 201, 5)
united.hist('Delay', bins = united_bins)


united.sample(10).hist('Delay', bins = united_bins)


united.sample(1000).hist('Delay', bins = united_bins)


united.sample(2000).hist('Delay', bins = united_bins)


np.median(united.column('Delay'))

2.0


np.median(united.sample(10).column('Delay'))

0.0


def sample_median(size):
    return np.median(united.sample(size).column('Delay'))


sample_median(10)

-0.5


sample_medians = make_array()

for i in np.arange(1000):
    new_median = sample_median(10)
    sample_medians = np.append(sample_medians, new_median)


medians = Table().with_columns(
    "Sample Medians", sample_medians,
    "Sample Size", 10)
medians.hist("Sample Medians", bins = 50)


sample_medians2 = make_array()

for i in np.arange(1000):
    new_median = sample_median(1000)
    sample_medians2 = np.append(sample_medians2, new_median)


medians.append(Table().with_columns(
    "Sample Medians", sample_medians2,
    "Sample Size", 1000)).hist("Sample Medians", group="Sample Size", bins=50)

Row	Date	Flight Number	Destination	Delay
0	6/1/15	73	HNL	257
1	6/1/15	217	EWR	28
2	6/1/15	237	STL	-3
3	6/1/15	250	SAN	0
4	6/1/15	267	PHL	64
5	6/1/15	273	SEA	-6
6	6/1/15	278	SEA	-8
7	6/1/15	292	EWR	12
8	6/1/15	300	HNL	20
9	6/1/15	317	IND	-10

Row	Date	Flight Number	Destination	Delay
26	6/1/15	502	JFK	-4
33	6/1/15	637	JFK	141
39	6/1/15	704	JFK	-8
50	6/1/15	758	JFK	-5
51	6/1/15	760	JFK	352
56	6/1/15	824	JFK	3
57	6/1/15	898	JFK	290
179	6/2/15	502	JFK	0
188	6/2/15	637	JFK	202
194	6/2/15	704	JFK	-11

Row	Date	Flight Number	Destination	Delay
7063	7/18/15	329	DEN	-5
2302	6/16/15	759	EWR	90
5593	7/8/15	1454	DEN	-7

Row	Date	Flight Number	Destination	Delay
2603	6/18/15	637	JFK	18
8291	7/26/15	637	JFK	61
3336	6/23/15	502	JFK	40

Row	Date	Flight Number	Destination	Delay
70	6/1/15	1124	LAS	-1
5070	7/5/15	599	BOS	-7
10070	8/6/15	1641	IAD	-2

Lecture 15¶

The Monty Hall Problem¶

Random Sampling¶

Distributions¶

Large Random Samples¶

Simulating Statistics¶

My Choice	Monty's Door
goat	goat
car	goat
goat	goat
goat	goat
goat	goat
car	goat
goat	goat
goat	goat
goat	goat
goat	goat

Row	Date	Flight Number	Destination	Delay
3140	6/21/15	1964	SEA	580
3154	6/22/15	300	HNL	537
3069	6/21/15	1149	IAD	508
2888	6/20/15	353	ORD	505
12627	8/23/15	1589	ORD	458
7949	7/23/15	1960	LAX	438
3412	6/23/15	1606	ORD	430
578	6/4/15	1743	LAX	408
2474	6/17/15	1122	HNL	405
8426	7/27/15	572	ORD	385

Face
1
2
3
4
5
6

Face
4
5
6