from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Predefined functions; they should look familiar to functions you've coded in assignments!
def standard_units(arr):
return (arr - np.average(arr))/np.std(arr)
def correlation(t, x, y):
x_standard = standard_units(t.column(x))
y_standard = standard_units(t.column(y))
return np.average(x_standard * y_standard)
def slope(t, x, y):
r = correlation(t, x, y)
y_sd = np.std(t.column(y))
x_sd = np.std(t.column(x))
return r * y_sd / x_sd
def intercept(t, x, y):
x_mean = np.mean(t.column(x))
y_mean = np.mean(t.column(y))
return y_mean - slope(t, x, y)*x_mean
def fitted_values(t, x, y):
"""Return an array of the regression estimates at all the x values"""
a = slope(t, x, y)
b = intercept(t, x, y)
return a*t.column(x) + b
def residuals(t, x, y):
"""Return an array of all the residuals"""
predictions = fitted_values(t, x, y)
return t.column(y) - predictions
# Ignore this code; it produces plots for demonstrating the regression model
def draw_and_compare(true_slope, true_int, sample_size):
x = np.random.normal(50, 5, sample_size)
xlims = np.array([np.min(x), np.max(x)])
errors = np.random.normal(0, 6, sample_size)
y = (true_slope * x + true_int) + errors
sample = Table().with_columns('x', x, 'y', y)
sample.scatter('x', 'y')
plots.plot(xlims, true_slope*xlims + true_int, lw=2, color='green')
plots.title('True Line, and Points Created')
sample.scatter('x', 'y')
plots.title('What We Get to See')
sample.scatter('x', 'y', fit_line=True)
plots.title('Regression Line: Estimate of True Line')
sample.scatter('x', 'y', fit_line=True)
plots.plot(xlims, true_slope*xlims + true_int, lw=2, color='green')
plots.title("Regression Line and True Line")
draw_and_compare(2, -5, 10)
draw_and_compare(2, -5, 10)
draw_and_compare(2, -5, 100)
draw_and_compare(2, -5, 100)
births = Table.read_table('baby.csv')
births.show(3)
Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
---|---|---|---|---|---|
120 | 284 | 27 | 62 | 100 | False |
113 | 282 | 33 | 64 | 135 | False |
128 | 279 | 28 | 64 | 115 | True |
... (1171 rows omitted)
# Preterm and postterm pregnancy cutoffs, according to the CDC
37 * 7, 42 * 7
(259, 294)
births.scatter('Gestational Days', 'Birth Weight')
births = births.where('Gestational Days', are.between(225, 325))
births.scatter('Gestational Days', 'Birth Weight')
correlation(births, 'Gestational Days', 'Birth Weight')
0.42295118452423991
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
def prediction_at(t, x, y, x_value):
'''
t - table
x - label of x column
y - label of y column
x_value - the x value for which we want to predict y
'''
return slope(t, x, y) * x_value + intercept(t, x, y)
prediction_at_300 = prediction_at(births, 'Gestational Days', 'Birth Weight', 300)
prediction_at_300
130.80951674248769
x = 300
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.plot([x, x], [40, prediction_at_300], color='red', lw=2);
# You don't need to understand the plotting code in this cell,
# but you should understand the figure that comes out.
plots.figure(figsize=(10, 11))
plots.subplot(3, 2, 1)
plots.scatter(births[1], births[0], s=10, color='darkblue')
plots.xlim([225, 325])
plots.title('Original sample')
for i in np.arange(1, 6, 1):
plots.subplot(3,2,i+1)
resampled = births.sample()
plots.scatter(resampled.column('Gestational Days'), resampled.column('Birth Weight'), s=10, color='tab:green')
plots.xlim([225, 325])
plots.title('Bootstrap sample '+str(i))
plots.tight_layout()
for i in np.arange(4):
resample = births.sample()
predicted_y = prediction_at(resample, 'Gestational Days', 'Birth Weight', 300)
print('Predicted y from bootstrap sample was', predicted_y)
resample.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.scatter(300, predicted_y, color='gold', s=50, zorder=3);
plots.plot([x, x], [40, predicted_y], color='red', lw=2);
plots.plot([200, x], [predicted_y, predicted_y], color='red', lw=2);
Predicted y from bootstrap sample was 130.390184934 Predicted y from bootstrap sample was 129.185230185 Predicted y from bootstrap sample was 129.963277846 Predicted y from bootstrap sample was 131.997919642
lines = Table(['slope','intercept', 'at 210', 'at 300', 'at 320'])
for i in range(10):
resample = births.sample()
a = slope(resample, 'Gestational Days', 'Birth Weight')
b = intercept(resample, 'Gestational Days', 'Birth Weight')
lines.append([a, b, a * 210 + b, a * 300 + b, a * 320 + b])
lines
slope | intercept | at 210 | at 300 | at 320 |
---|---|---|---|---|
0.541702 | -31.4892 | 82.2683 | 131.021 | 141.856 |
0.482094 | -14.8571 | 86.3826 | 129.771 | 139.413 |
0.533682 | -29.0976 | 82.9757 | 131.007 | 141.681 |
0.531037 | -28.4185 | 83.0994 | 130.893 | 141.513 |
0.491476 | -17.7542 | 85.4558 | 129.689 | 139.518 |
0.489538 | -16.5877 | 86.2154 | 130.274 | 140.065 |
0.552017 | -35.138 | 80.7855 | 130.467 | 141.507 |
0.523219 | -27.2175 | 82.6584 | 129.748 | 140.212 |
0.591405 | -45.2561 | 78.939 | 132.165 | 143.994 |
0.496462 | -19.307 | 84.95 | 129.632 | 139.561 |
for i in np.arange(lines.num_rows):
line = lines.row(i)
plots.plot([210, 320], [line.item('at 210'), line.item('at 320')], lw=1)
plots.scatter(300, line.item('at 300'), s=30, zorder=3)
np.mean(births.column('Gestational Days')), np.mean(births.column('Birth Weight'))
(279.11015490533561, 119.57401032702238)
lines = Table(['slope','intercept', 'at 291', 'at 300', 'at 309'])
for i in range(10):
resample = births.sample()
a = slope(resample, 'Gestational Days', 'Birth Weight')
b = intercept(resample, 'Gestational Days', 'Birth Weight')
lines.append([a, b, a * 291 + b, a * 300 + b, a * 309 + b])
lines
slope | intercept | at 291 | at 300 | at 309 |
---|---|---|---|---|
0.575427 | -40.8788 | 126.571 | 131.749 | 136.928 |
0.545005 | -33.0862 | 125.51 | 130.415 | 135.32 |
0.576423 | -41.8477 | 125.891 | 131.079 | 136.267 |
0.521728 | -26.4287 | 125.394 | 130.09 | 134.785 |
0.575251 | -40.9346 | 126.464 | 131.641 | 136.818 |
0.547634 | -33.5314 | 125.83 | 130.759 | 135.688 |
0.577011 | -41.1714 | 126.739 | 131.932 | 137.125 |
0.529463 | -28.938 | 125.136 | 129.901 | 134.666 |
0.500984 | -20.3506 | 125.436 | 129.945 | 134.453 |
0.507768 | -23.0621 | 124.698 | 129.268 | 133.838 |
for i in np.arange(lines.num_rows):
line = lines.row(i)
plots.plot([291, 309], [line.item('at 291'), line.item('at 309')], lw=1)
plots.scatter(300, line.item('at 300'), s=30, zorder=3)
def bootstrap_prediction(t, x, y, new_x, repetitions=2500):
"""
Makes a 95% confidence interval for the height of the true line at new_x,
using linear regression on the data in t (column names x and y).
Shows a histogram of the bootstrap samples and shows the interval
in gold.
"""
# Bootstrap the scatter, predict, collect
predictions = make_array()
for i in np.arange(repetitions):
resample = t.sample()
predicted_y = prediction_at(resample, x, y, new_x)
predictions = np.append(predictions, predicted_y)
# Find the ends of the approximate 95% prediction interval
left = percentile(2.5, predictions)
right = percentile(97.5, predictions)
round_left = round(left, 3)
round_right = round(right, 3)
# Display results
Table().with_column('Prediction', predictions).hist(bins=20)
plots.xlabel('predictions at x='+str(new_x))
plots.plot([left, right], [0, 0], color='yellow', lw=8);
print('Approximate 95%-confidence interval for height of true line at x =', new_x)
print(round_left, 'to', round_right, '( width =', round(right - left, 3), ')')
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 300)
Approximate 95%-confidence interval for height of true line at x = 300 128.944 to 132.793 ( width = 3.849 )
x = 300
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.plot([x, x], [40, prediction_at_300], color='blue', lw=2);
prediction_at_230 = prediction_at(births, 'Gestational Days', 'Birth Weight', 230)
plots.plot([230, 230], [40, prediction_at_230], color='orange', lw=2);
prediction_at_280 = prediction_at(births, 'Gestational Days', 'Birth Weight', 280)
plots.plot([280, 280], [40, prediction_at_280], color='black', lw=2);
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 230)
Approximate 95%-confidence interval for height of true line at x = 230 89.205 to 97.263 ( width = 8.057 )
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 280)
Approximate 95%-confidence interval for height of true line at x = 280 119.098 to 121.046 ( width = 1.948 )
# No need to follow the code in this cell; just understand the graph
lines = Table(['slope','intercept', 'at 210', 'at 300', 'at 320'])
for i in range(10):
resample = births.sample()
a = slope(resample, 'Gestational Days', 'Birth Weight')
b = intercept(resample, 'Gestational Days', 'Birth Weight')
lines.append([a, b, a * 210 + b, a * 300 + b, a * 320 + b])
for i in np.arange(lines.num_rows):
line = lines.row(i)
plots.plot([210, 320], [line.item('at 210'), line.item('at 320')], lw=1)
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
slope(births, 'Gestational Days', 'Birth Weight')
0.53784536766790358
def bootstrap_slope(t, x, y, repetitions=2500):
"""
Makes a 95% confidence interval for the slope of the true line,
using linear regression on the data in t (column names x and y).
Shows a histogram of the bootstrap samples and shows the interval
in gold.
"""
# Bootstrap the scatter, find the slope, collect
slopes = make_array()
for i in np.arange(repetitions):
bootstrap_sample = t.sample()
bootstrap_slope = slope(bootstrap_sample, x, y)
slopes = np.append(slopes, bootstrap_slope)
# Find the endpoints of the 95% confidence interval for the true slope
left = percentile(2.5, slopes)
right = percentile(97.5, slopes)
round_left = round(left, 3)
round_right = round(right, 3)
# Slope of the regression line from the original sample
observed_slope = slope(t, x, y)
# Display results (no need to follow this code)
Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);
print('Slope of regression line:', round(observed_slope, 3))
print('Approximate 95%-confidence interval for the slope of the true line:')
print(round_left, 'to', round_right)
bootstrap_slope(births, 'Gestational Days', 'Birth Weight')
Slope of regression line: 0.538 Approximate 95%-confidence interval for the slope of the true line: 0.456 to 0.614
draw_and_compare(0, 10, 15)
births.scatter('Maternal Age', 'Birth Weight')
slope(births, 'Maternal Age', 'Birth Weight')
0.095142237298344659
births.scatter('Maternal Age', 'Birth Weight', fit_line=True)
Null: Slope of true line is equal to 0.
Alternative: Slope of true line is not equal to 0.
bootstrap_slope(births, 'Maternal Age', 'Birth Weight')
Slope of regression line: 0.095 Approximate 95%-confidence interval for the slope of the true line: -0.091 to 0.283