In [1]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In [2]:
# Predefined functions; they should look familiar to functions you've coded in assignments!
def standard_units(arr):
    """Convert an array to standard units (z-scores): (value - mean) / SD."""
    mean = np.average(arr)
    sd = np.std(arr)
    return (arr - mean) / sd

def correlation(t, x, y):
    """Correlation coefficient r between columns labeled x and y of table t.

    r is the mean of the products of the two columns in standard units.
    """
    su_x = standard_units(t.column(x))
    su_y = standard_units(t.column(y))
    return np.average(su_x * su_y)

def slope(t, x, y):
    """Slope of the regression line predicting column y from column x of t.

    Computed as r * SD(y) / SD(x).
    """
    return correlation(t, x, y) * np.std(t.column(y)) / np.std(t.column(x))

def intercept(t, x, y):
    """Intercept of the regression line predicting column y from column x of t.

    Uses the fact that the regression line passes through the point of
    averages: intercept = mean(y) - slope * mean(x).
    """
    return np.mean(t.column(y)) - slope(t, x, y) * np.mean(t.column(x))

def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values."""
    return slope(t, x, y) * t.column(x) + intercept(t, x, y)

def residuals(t, x, y):
    """Return an array of all the residuals: observed y minus fitted value."""
    return t.column(y) - fitted_values(t, x, y)

Regression Model (Demo Slide 10)

In [3]:
# Ignore this code; it produces plots for demonstrating the regression model
def draw_and_compare(true_slope, true_int, sample_size):
    """Simulate the regression model and draw four illustrative plots.

    Generates sample_size points whose x values are normal(50, 5) and whose
    y values lie on the line y = true_slope * x + true_int plus normal(0, 6)
    noise, then shows: (1) the true line with the generated points, (2) the
    points alone, (3) the fitted regression line, and (4) the regression
    line together with the true line.
    """
    xs = np.random.normal(50, 5, sample_size)
    line_ends = np.array([np.min(xs), np.max(xs)])
    noise = np.random.normal(0, 6, sample_size)
    ys = true_slope * xs + true_int + noise
    sample = Table().with_columns('x', xs, 'y', ys)

    sample.scatter('x', 'y')
    plots.plot(line_ends, true_slope * line_ends + true_int, lw=2, color='green')
    plots.title('True Line, and Points Created')

    sample.scatter('x', 'y')
    plots.title('What We Get to See')

    sample.scatter('x', 'y', fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    sample.scatter('x', 'y', fit_line=True)
    plots.plot(line_ends, true_slope * line_ends + true_int, lw=2, color='green')
    plots.title("Regression Line and True Line")
In [4]:
draw_and_compare(2, -5, 10)
In [5]:
draw_and_compare(2, -5, 10)
In [6]:
draw_and_compare(2, -5, 100)
In [7]:
draw_and_compare(2, -5, 100)





Return to Slides




Prediction (Demo Slide 11)

In [8]:
births = Table.read_table('baby.csv')
births.show(3)
Birth Weight Gestational Days Maternal Age Maternal Height Maternal Pregnancy Weight Maternal Smoker
120 284 27 62 100 False
113 282 33 64 135 False
128 279 28 64 115 True

... (1171 rows omitted)

In [9]:
# Preterm and postterm pregnancy cutoffs, according to the CDC
37 * 7, 42 * 7
Out[9]:
(259, 294)
In [10]:
births.scatter('Gestational Days', 'Birth Weight')
In [11]:
births = births.where('Gestational Days', are.between(225, 325))
In [12]:
births.scatter('Gestational Days', 'Birth Weight')

Suppose we assume the regression model

In [13]:
correlation(births, 'Gestational Days', 'Birth Weight')
Out[13]:
0.42295118452423991
In [14]:
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)

Prediction at a Given Value of x

In [15]:
def prediction_at(t, x, y, x_value):
    '''
    t - table
    x - label of x column
    y - label of y column
    x_value - the x value for which we want to predict y
    '''
    return slope(t, x, y) * x_value + intercept(t, x, y)
In [16]:
prediction_at_300 = prediction_at(births, 'Gestational Days', 'Birth Weight', 300)
prediction_at_300
Out[16]:
130.80951674248769
In [17]:
x = 300
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.plot([x, x], [40, prediction_at_300], color='red', lw=2);





Return to Slides




Bootstrapping the Sample (Demo Slide 15)

In [18]:
# You don't need to understand the plotting code in this cell,
# but you should understand the figure that comes out.

# Top-left panel: the original sample. births[1] and births[0] index the
# table's columns by position: 'Gestational Days' and 'Birth Weight'
# (see the schema displayed when births was loaded).
plots.figure(figsize=(10, 11))
plots.subplot(3, 2, 1)
plots.scatter(births[1], births[0], s=10, color='darkblue')
plots.xlim([225, 325])
plots.title('Original sample')

# Remaining five panels: one bootstrap resample (rows drawn with
# replacement via Table.sample) each, plotted on the same x-limits so
# the panels are directly comparable.
for i in np.arange(1, 6, 1):
    plots.subplot(3,2,i+1)
    resampled = births.sample()
    plots.scatter(resampled.column('Gestational Days'), resampled.column('Birth Weight'), s=10, color='tab:green')
    plots.xlim([225, 325])
    plots.title('Bootstrap sample '+str(i))
plots.tight_layout()
In [19]:
# Draw 4 bootstrap resamples; for each one, fit a regression line, predict
# birth weight at 300 gestational days, and mark the prediction with red
# guide lines and a gold dot.
x_value = 300  # bound here instead of relying on `x` leaking from an earlier cell
for i in np.arange(4):
    resample = births.sample()
    predicted_y = prediction_at(resample, 'Gestational Days', 'Birth Weight', x_value)
    print('Predicted y from bootstrap sample was', predicted_y)
    resample.scatter('Gestational Days', 'Birth Weight', fit_line=True)
    plots.scatter(x_value, predicted_y, color='gold', s=50, zorder=3);
    plots.plot([x_value, x_value], [40, predicted_y], color='red', lw=2);
    plots.plot([200, x_value], [predicted_y, predicted_y], color='red', lw=2);
Predicted y from bootstrap sample was 130.390184934
Predicted y from bootstrap sample was 129.185230185
Predicted y from bootstrap sample was 129.963277846
Predicted y from bootstrap sample was 131.997919642
In [20]:
# Fit a regression line to each of 10 bootstrap resamples and record its
# slope, intercept, and predicted y at three x values (210, 300, 320).
lines = Table(['slope','intercept', 'at 210', 'at 300', 'at 320'])

for i in range(10):
    resample = births.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 210 + b, a * 300 + b, a * 320 + b])
lines
Out[20]:
slope intercept at 210 at 300 at 320
0.541702 -31.4892 82.2683 131.021 141.856
0.482094 -14.8571 86.3826 129.771 139.413
0.533682 -29.0976 82.9757 131.007 141.681
0.531037 -28.4185 83.0994 130.893 141.513
0.491476 -17.7542 85.4558 129.689 139.518
0.489538 -16.5877 86.2154 130.274 140.065
0.552017 -35.138 80.7855 130.467 141.507
0.523219 -27.2175 82.6584 129.748 140.212
0.591405 -45.2561 78.939 132.165 143.994
0.496462 -19.307 84.95 129.632 139.561
In [21]:
# Draw all 10 bootstrap regression lines over the range x = 210 to 320;
# each dot marks that line's prediction at x = 300.
for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([210, 320], [line.item('at 210'), line.item('at 320')], lw=1)
    plots.scatter(300, line.item('at 300'), s=30, zorder=3)
In [22]:
np.mean(births.column('Gestational Days')), np.mean(births.column('Birth Weight'))
Out[22]:
(279.11015490533561, 119.57401032702238)
In [23]:
# Repeat the 10-resample bootstrap, but record predictions at x values
# near the mean gestational days (about 279, computed above): 291, 300, 309.
lines = Table(['slope','intercept', 'at 291', 'at 300', 'at 309'])

for i in range(10):
    resample = births.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 291 + b, a * 300 + b, a * 309 + b])
lines
Out[23]:
slope intercept at 291 at 300 at 309
0.575427 -40.8788 126.571 131.749 136.928
0.545005 -33.0862 125.51 130.415 135.32
0.576423 -41.8477 125.891 131.079 136.267
0.521728 -26.4287 125.394 130.09 134.785
0.575251 -40.9346 126.464 131.641 136.818
0.547634 -33.5314 125.83 130.759 135.688
0.577011 -41.1714 126.739 131.932 137.125
0.529463 -28.938 125.136 129.901 134.666
0.500984 -20.3506 125.436 129.945 134.453
0.507768 -23.0621 124.698 129.268 133.838
In [24]:
# Same plot as before, but over the narrower range x = 291 to 309 near the
# center of the data; the predictions vary less here than at the extremes.
for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([291, 309], [line.item('at 291'), line.item('at 309')], lw=1)
    plots.scatter(300, line.item('at 300'), s=30, zorder=3)

Prediction Interval

In [25]:
def bootstrap_prediction(t, x, y, new_x, repetitions=2500):
    """ 
    Makes an approximate 95% confidence interval for the height of the
    true line at new_x, using linear regression on the data in t
    (column labels x and y).
    Shows a histogram of the bootstrap predictions, draws the interval
    as a yellow bar at the base of the histogram, and prints the interval.

    t - table
    x - label of x column
    y - label of y column
    new_x - the x value at which to estimate the height of the true line
    repetitions - number of bootstrap resamples (default 2500)
    """

    # Bootstrap the scatter, predict, collect
    predictions = make_array()
    for i in np.arange(repetitions):
        resample = t.sample()  # resample rows with replacement
        predicted_y = prediction_at(resample, x, y, new_x)
        predictions = np.append(predictions, predicted_y)

    # Find the ends of the approximate 95% prediction interval
    # (middle 95% of the bootstrap predictions)
    left = percentile(2.5, predictions)
    right = percentile(97.5, predictions)
    round_left = round(left, 3)
    round_right = round(right, 3)

    # Display results
    Table().with_column('Prediction', predictions).hist(bins=20)
    plots.xlabel('predictions at x='+str(new_x))
    plots.plot([left, right], [0, 0], color='yellow', lw=8);
    print('Approximate 95%-confidence interval for height of true line at x =', new_x)
    print(round_left, 'to', round_right, '( width =', round(right - left, 3), ')') 
In [26]:
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 300)
Approximate 95%-confidence interval for height of true line at x = 300
128.944 to 132.793 ( width = 3.849 )





Return to Slides




Predictions at Different Values of x (Demo Slide 16)

In [27]:
# One figure comparing the regression predictions at three x values:
# x = 300 (blue), x = 230 (orange), and x = 280 (black).
x = 300
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.plot([x, x], [40, prediction_at_300], color='blue', lw=2);
prediction_at_230 = prediction_at(births, 'Gestational Days', 'Birth Weight', 230)
plots.plot([230, 230], [40, prediction_at_230], color='orange', lw=2);
prediction_at_280 = prediction_at(births, 'Gestational Days', 'Birth Weight', 280)
plots.plot([280, 280], [40, prediction_at_280], color='black', lw=2);
In [28]:
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 230)
Approximate 95%-confidence interval for height of true line at x = 230
89.205 to 97.263 ( width = 8.057 )
In [29]:
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 280)
Approximate 95%-confidence interval for height of true line at x = 280
119.098 to 121.046 ( width = 1.948 )
In [30]:
# No need to follow the code in this cell; just understand the graph

# Draw 10 bootstrap regression lines over [210, 320]; the lines spread
# apart at the edges of the data and cluster near its center.
lines = Table(['slope','intercept', 'at 210', 'at 300', 'at 320'])

for i in range(10):
    resample = births.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 210 + b, a * 300 + b, a * 320 + b])

for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([210, 320], [line.item('at 210'), line.item('at 320')], lw=1)





Return to Slides




Inference for the True Slope (Demo Slide 18)

In [31]:
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
In [32]:
slope(births, 'Gestational Days', 'Birth Weight')
Out[32]:
0.53784536766790358
In [33]:
def bootstrap_slope(t, x, y, repetitions=2500):
    """ 
    Makes an approximate 95% confidence interval for the slope of the
    true line, using linear regression on the data in t (column labels
    x and y).
    Shows a histogram of the bootstrap slopes, draws the interval as a
    yellow bar at the base of the histogram, and prints the observed
    slope and the interval.

    t - table
    x - label of x column
    y - label of y column
    repetitions - number of bootstrap resamples (default 2500)
    """

    # Bootstrap the scatter, find the slope of each resample, collect.
    # (The loop variable is deliberately NOT named `bootstrap_slope`:
    # that would shadow this function's own name inside its body.)
    slopes = make_array()
    for i in np.arange(repetitions):
        resample = t.sample()
        slopes = np.append(slopes, slope(resample, x, y))

    # Find the endpoints of the 95% confidence interval for the true slope
    # (middle 95% of the bootstrap slopes)
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)
    round_left = round(left, 3)
    round_right = round(right, 3)

    # Slope of the regression line from the original sample
    observed_slope = slope(t, x, y)

    # Display results (no need to follow this code)
    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);
    print('Slope of regression line:', round(observed_slope, 3))
    print('Approximate 95%-confidence interval for the slope of the true line:')
    print(round_left, 'to', round_right)
In [34]:
bootstrap_slope(births, 'Gestational Days', 'Birth Weight')
Slope of regression line: 0.538
Approximate 95%-confidence interval for the slope of the true line:
0.456 to 0.614





Return to Slides




Rain on the Regression Parade (Demo Slide 19)

In [35]:
draw_and_compare(0, 10, 15)

Maternal Age and Birth Weight

In [36]:
births.scatter('Maternal Age', 'Birth Weight')
In [37]:
slope(births, 'Maternal Age', 'Birth Weight')
Out[37]:
0.095142237298344659
In [38]:
births.scatter('Maternal Age', 'Birth Weight', fit_line=True)




Hypothesis Test:

Null: Slope of true line is equal to 0.

Alternative: Slope of true line is not equal to 0.

In [39]:
bootstrap_slope(births, 'Maternal Age', 'Birth Weight')
Slope of regression line: 0.095
Approximate 95%-confidence interval for the slope of the true line:
-0.091 to 0.283
In [ ]: