from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Load the midterm data; each row pairs a Section number with a Midterm score.
scores = Table.read_table('scores_by_section.csv')
scores
Section | Midterm |
---|---|
1 | 22 |
2 | 12 |
2 | 23 |
2 | 14 |
1 | 20 |
3 | 25 |
4 | 19 |
1 | 24 |
5 | 8 |
6 | 14 |
... (349 rows omitted)
# Count how many rows (scores) each section contributes.
scores.group('Section')
Section | count |
---|---|
1 | 32 |
2 | 32 |
3 | 27 |
4 | 30 |
5 | 33 |
6 | 32 |
7 | 24 |
8 | 29 |
9 | 30 |
10 | 34 |
... (2 rows omitted)
# Average Midterm score within each section; .show() displays all rows
# rather than the truncated default view.
scores.group('Section', np.average).show()
Section | Midterm average |
---|---|
1 | 15.5938 |
2 | 15.125 |
3 | 13.6667 |
4 | 14.7667 |
5 | 17.4545 |
6 | 15.0312 |
7 | 16.625 |
8 | 16.3103 |
9 | 14.5667 |
10 | 15.2353 |
11 | 15.8077 |
12 | 15.7333 |
# Observed test statistic: Section 3's average Midterm score, copied (rounded)
# from the per-section averages table above.
observed_average = 13.6667
# Section 3 has 27 rows, so draw 27 rows from the whole class, without
# replacement, to see what one comparable random sample looks like.
random_sample = scores.sample(27, with_replacement=False)
random_sample
Section | Midterm |
---|---|
6 | 17 |
11 | 19 |
2 | 5 |
8 | 20 |
4 | 11 |
11 | 13 |
8 | 19 |
6 | 16 |
5 | 24 |
12 | 23 |
... (17 rows omitted)
# Average Midterm score of this one random sample of 27 rows.
np.average(random_sample.column('Midterm'))
17.333333333333332
# Simulate one value of the test statistic
# under the hypothesis that the section is like a random sample from the class
def random_sample_midterm_avg(sample_size=27):
    """Return the average Midterm score of one random sample from the class.

    Draws ``sample_size`` rows from the ``scores`` table without replacement
    and averages their 'Midterm' column. This is one simulated value of the
    test statistic under the hypothesis that the section is like a random
    sample from the class.

    sample_size: number of rows to draw. Defaults to 27, the number of rows
        in the section being tested.
    """
    sampled_rows = scores.sample(sample_size, with_replacement=False)
    return np.average(sampled_rows.column('Midterm'))
# Simulate 50,000 copies of the test statistic.
# The values are collected in a list and converted to an array once:
# calling np.append inside the loop would copy the whole array on every
# iteration, which is needlessly quadratic for this many repetitions.
num_simulations = 50000
sample_averages = np.array(
    [random_sample_midterm_avg() for _ in range(num_simulations)]
)
# Compare the simulated distribution of the statistic
# and the actual observed statistic
averages_tbl = Table().with_column('Random Sample Average', sample_averages)
# Histogram of the simulated sample averages.
averages_tbl.hist(bins = 20)
# Red dot just below the horizontal axis marks the observed average;
# the trailing semicolon suppresses the notebook's text output for the call.
plots.scatter(observed_average, -0.01, color='red', s=120);
# (1) Calculate the p-value: simulation area beyond observed value,
#     i.e. the proportion of simulated averages at or below the observed one.
#     Dividing by the actual number of simulated values (rather than a
#     hard-coded 50000) keeps the proportion correct if the number of
#     simulations is ever changed.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)
# (2) See if this is less than 5%
0.05612
# (1) Find simulated value corresponding to 5% of the simulated averages
#     (5% of 50,000 = 2500). The index is derived from the number of
#     simulated values so it stays at the 5% mark if that number changes.
five_percent_index = len(sample_averages) // 20
# Sort ascending and read off the value at the 5% position.
five_percent_point = np.sort(sample_averages).item(five_percent_index)
five_percent_point
13.62962962962963
# (2) See if this value is greater than observed value:
#     if it is, the observed average lies within the lowest 5% of the
#     simulated averages; if not, it lies outside that 5% tail.
observed_average
13.6667
# Redraw the histogram of simulated averages.
averages_tbl.hist(bins = 20)
# Vertical gold line at the 5% point, drawn from height 0 to 0.35.
plots.plot([five_percent_point, five_percent_point], [0, 0.35], color='gold', lw=2)
plots.title('Area to the left of the gold line: 5%');
# Red dot just below the horizontal axis marks the observed average.
plots.scatter(observed_average, -0.01, color='red', s=120);