In [1]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In [2]:
# Predefined functions; they should look familiar to functions you've coded in assignments!
def standard_units(arr):
    """Convert an array to standard units (z-scores): (value - mean) / SD."""
    mean = np.average(arr)
    sd = np.std(arr)
    return (arr - mean) / sd

def correlation(t, x, y):
    """Correlation coefficient r between columns labeled x and y of table t.

    r is the mean of the products of the two columns in standard units.
    """
    su_x = standard_units(t.column(x))
    su_y = standard_units(t.column(y))
    return np.average(su_x * su_y)

def slope(t, x, y):
    """Slope of the regression line predicting column y from column x of t.

    Computed as r * SD(y) / SD(x).
    """
    return correlation(t, x, y) * np.std(t.column(y)) / np.std(t.column(x))

def intercept(t, x, y):
    """Intercept of the regression line predicting column y from column x of t.

    Uses the fact that the regression line passes through the point of
    averages: intercept = mean(y) - slope * mean(x).
    """
    return np.mean(t.column(y)) - slope(t, x, y) * np.mean(t.column(x))

def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values."""
    return slope(t, x, y) * t.column(x) + intercept(t, x, y)

def residuals(t, x, y):
    """Return an array of all the residuals: observed y minus fitted value."""
    return t.column(y) - fitted_values(t, x, y)

Regression Model (Demo Slide 10)

In [3]:
# Ignore this code; it produces plots for demonstrating the regression model
def draw_and_compare(true_slope, true_int, sample_size):
    """Simulate the regression model and draw four illustrative plots.

    Generates sample_size points whose x values are normal(50, 5) and whose
    y values lie on the line y = true_slope * x + true_int plus normal(0, 6)
    noise, then shows: (1) the true line with the generated points, (2) the
    points alone, (3) the fitted regression line, and (4) the regression
    line together with the true line.
    """
    xs = np.random.normal(50, 5, sample_size)
    line_ends = np.array([np.min(xs), np.max(xs)])
    noise = np.random.normal(0, 6, sample_size)
    ys = true_slope * xs + true_int + noise
    sample = Table().with_columns('x', xs, 'y', ys)

    sample.scatter('x', 'y')
    plots.plot(line_ends, true_slope * line_ends + true_int, lw=2, color='green')
    plots.title('True Line, and Points Created')

    sample.scatter('x', 'y')
    plots.title('What We Get to See')

    sample.scatter('x', 'y', fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    sample.scatter('x', 'y', fit_line=True)
    plots.plot(line_ends, true_slope * line_ends + true_int, lw=2, color='green')
    plots.title("Regression Line and True Line")
In [4]:
draw_and_compare(2, -5, 10)
In [5]:
draw_and_compare(2, -5, 10)
In [6]:
draw_and_compare(2, -5, 100)
In [7]:
draw_and_compare(2, -5, 100)





Return to Slides




Prediction (Demo Slide 11)

In [8]:
births = Table.read_table('baby.csv')
births.show(3)
Birth Weight Gestational Days Maternal Age Maternal Height Maternal Pregnancy Weight Maternal Smoker
120 284 27 62 100 False
113 282 33 64 135 False
128 279 28 64 115 True

... (1171 rows omitted)

In [9]:
# Preterm and postterm pregnancy cutoffs, according to the CDC
37 * 7, 42 * 7
Out[9]:
(259, 294)
In [10]:
births.scatter('Gestational Days', 'Birth Weight')
In [11]:
births = births.where('Gestational Days', are.between(225, 325))
In [12]:
births.scatter('Gestational Days', 'Birth Weight')

Suppose we assume the regression model

In [13]:
correlation(births, 'Gestational Days', 'Birth Weight')
Out[13]:
0.42295118452423991
In [14]:
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)

Prediction at a Given Value of x

In [15]:
def prediction_at(t, x, y, x_value):
    '''
    t - table
    x - label of x column
    y - label of y column
    x_value - the x value for which we want to predict y
    '''
    return slope(t, x, y) * x_value + intercept(t, x, y)
In [16]:
prediction_at_300 = prediction_at(births, 'Gestational Days', 'Birth Weight', 300)
prediction_at_300
Out[16]:
130.80951674248769
In [17]:
x = 300
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.plot([x, x], [40, prediction_at_300], color='red', lw=2);





Return to Slides




Bootstrapping the Sample (Demo Slide 15)

In [18]:
# You don't need to understand the plotting code in this cell,
# but you should understand the figure that comes out.

# Top-left panel: the original sample. births[1] and births[0] index the
# table's columns by position: 'Gestational Days' and 'Birth Weight'
# (see the schema displayed when births was loaded).
plots.figure(figsize=(10, 11))
plots.subplot(3, 2, 1)
plots.scatter(births[1], births[0], s=10, color='darkblue')
plots.xlim([225, 325])
plots.title('Original sample')

# Remaining five panels: one bootstrap resample (rows drawn with
# replacement via Table.sample) each, plotted on the same x-limits so
# the panels are directly comparable.
for i in np.arange(1, 6, 1):
    plots.subplot(3,2,i+1)
    resampled = births.sample()
    plots.scatter(resampled.column('Gestational Days'), resampled.column('Birth Weight'), s=10, color='tab:green')
    plots.xlim([225, 325])
    plots.title('Bootstrap sample '+str(i))
plots.tight_layout()
In [19]:
# Draw 4 bootstrap resamples; for each one, fit a regression line, predict
# birth weight at 300 gestational days, and mark the prediction with red
# guide lines and a gold dot.
x_value = 300  # bound here instead of relying on `x` leaking from an earlier cell
for i in np.arange(4):
    resample = births.sample()
    predicted_y = prediction_at(resample, 'Gestational Days', 'Birth Weight', x_value)
    print('Predicted y from bootstrap sample was', predicted_y)
    resample.scatter('Gestational Days', 'Birth Weight', fit_line=True)
    plots.scatter(x_value, predicted_y, color='gold', s=50, zorder=3);
    plots.plot([x_value, x_value], [40, predicted_y], color='red', lw=2);
    plots.plot([200, x_value], [predicted_y, predicted_y], color='red', lw=2);
Predicted y from bootstrap sample was 130.390184934
Predicted y from bootstrap sample was 129.185230185
Predicted y from bootstrap sample was 129.963277846
Predicted y from bootstrap sample was 131.997919642
In [20]:
# Fit a regression line to each of 10 bootstrap resamples and record its
# slope, intercept, and predicted y at three x values (210, 300, 320).
lines = Table(['slope','intercept', 'at 210', 'at 300', 'at 320'])

for i in range(10):
    resample = births.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 210 + b, a * 300 + b, a * 320 + b])
lines
Out[20]:
slope intercept at 210 at 300 at 320
0.541702 -31.4892 82.2683 131.021 141.856
0.482094 -14.8571 86.3826 129.771 139.413
0.533682 -29.0976 82.9757 131.007 141.681
0.531037 -28.4185 83.0994 130.893 141.513
0.491476 -17.7542 85.4558 129.689 139.518
0.489538 -16.5877 86.2154 130.274 140.065
0.552017 -35.138 80.7855 130.467 141.507
0.523219 -27.2175 82.6584 129.748 140.212
0.591405 -45.2561 78.939 132.165 143.994
0.496462 -19.307 84.95 129.632 139.561
In [21]:
# Draw all 10 bootstrap regression lines over the range x = 210 to 320;
# each dot marks that line's prediction at x = 300.
for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([210, 320], [line.item('at 210'), line.item('at 320')], lw=1)
    plots.scatter(300, line.item('at 300'), s=30, zorder=3)
In [22]:
np.mean(births.column('Gestational Days')), np.mean(births.column('Birth Weight'))
Out[22]:
(279.11015490533561, 119.57401032702238)
In [23]:
# Repeat the 10-resample bootstrap, but record predictions at x values
# near the mean gestational days (about 279, computed above): 291, 300, 309.
lines = Table(['slope','intercept', 'at 291', 'at 300', 'at 309'])

for i in range(10):
    resample = births.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 291 + b, a * 300 + b, a * 309 + b])
lines
Out[23]:
slope intercept at 291 at 300 at 309
0.575427 -40.8788 126.571 131.749 136.928
0.545005 -33.0862 125.51 130.415 135.32
0.576423 -41.8477 125.891 131.079 136.267
0.521728 -26.4287 125.394 130.09 134.785
0.575251 -40.9346 126.464 131.641 136.818
0.547634 -33.5314 125.83 130.759 135.688
0.577011 -41.1714 126.739 131.932 137.125
0.529463 -28.938 125.136 129.901 134.666
0.500984 -20.3506 125.436 129.945 134.453
0.507768 -23.0621 124.698 129.268 133.838
In [24]:
# Same plot as before, but over the narrower range x = 291 to 309 near the
# center of the data; the predictions vary less here than at the extremes.
for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([291, 309], [line.item('at 291'), line.item('at 309')], lw=1)
    plots.scatter(300, line.item('at 300'), s=30, zorder=3)

Prediction Interval

In [25]:
def bootstrap_prediction(t, x, y, new_x, repetitions=2500):
    """ 
    Makes an approximate 95% confidence interval for the height of the
    true line at new_x, using linear regression on the data in t
    (column labels x and y).
    Shows a histogram of the bootstrap predictions, draws the interval
    as a yellow bar at the base of the histogram, and prints the interval.

    t - table
    x - label of x column
    y - label of y column
    new_x - the x value at which to estimate the height of the true line
    repetitions - number of bootstrap resamples (default 2500)
    """

    # Bootstrap the scatter, predict, collect
    predictions = make_array()
    for i in np.arange(repetitions):
        resample = t.sample()  # resample rows with replacement
        predicted_y = prediction_at(resample, x, y, new_x)
        predictions = np.append(predictions, predicted_y)

    # Find the ends of the approximate 95% prediction interval
    # (middle 95% of the bootstrap predictions)
    left = percentile(2.5, predictions)
    right = percentile(97.5, predictions)
    round_left = round(left, 3)
    round_right = round(right, 3)

    # Display results
    Table().with_column('Prediction', predictions).hist(bins=20)
    plots.xlabel('predictions at x='+str(new_x))
    plots.plot([left, right], [0, 0], color='yellow', lw=8);
    print('Approximate 95%-confidence interval for height of true line at x =', new_x)
    print(round_left, 'to', round_right, '( width =', round(right - left, 3), ')') 
In [26]:
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 300)
Approximate 95%-confidence interval for height of true line at x = 300
128.944 to 132.793 ( width = 3.849 )





Return to Slides




Predictions at Different Values of x (Demo Slide 16)

In [27]:
# One figure comparing the regression predictions at three x values:
# x = 300 (blue), x = 230 (orange), and x = 280 (black).
x = 300
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
plots.plot([x, x], [40, prediction_at_300], color='blue', lw=2);
prediction_at_230 = prediction_at(births, 'Gestational Days', 'Birth Weight', 230)
plots.plot([230, 230], [40, prediction_at_230], color='orange', lw=2);
prediction_at_280 = prediction_at(births, 'Gestational Days', 'Birth Weight', 280)
plots.plot([280, 280], [40, prediction_at_280], color='black', lw=2);
In [28]:
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 230)
Approximate 95%-confidence interval for height of true line at x = 230
89.205 to 97.263 ( width = 8.057 )
In [29]:
bootstrap_prediction(births, 'Gestational Days', 'Birth Weight', 280)
Approximate 95%-confidence interval for height of true line at x = 280
119.098 to 121.046 ( width = 1.948 )
In [30]:
# No need to follow the code in this cell; just understand the graph

# Draw 10 bootstrap regression lines over [210, 320]; the lines spread
# apart at the edges of the data and cluster near its center.
lines = Table(['slope','intercept', 'at 210', 'at 300', 'at 320'])

for i in range(10):
    resample = births.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 210 + b, a * 300 + b, a * 320 + b])

for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([210, 320], [line.item('at 210'), line.item('at 320')], lw=1)





Return to Slides




Inference for the True Slope (Demo Slide 18)

In [31]:
births.scatter('Gestational Days', 'Birth Weight', fit_line=True)
In [32]:
slope(births, 'Gestational Days', 'Birth Weight')
Out[32]:
0.53784536766790358
In [33]:
def bootstrap_slope(t, x, y, repetitions=2500):
    """ 
    Makes an approximate 95% confidence interval for the slope of the
    true line, using linear regression on the data in t (column labels
    x and y).
    Shows a histogram of the bootstrap slopes, draws the interval as a
    yellow bar at the base of the histogram, and prints the observed
    slope and the interval.

    t - table
    x - label of x column
    y - label of y column
    repetitions - number of bootstrap resamples (default 2500)
    """

    # Bootstrap the scatter, find the slope of each resample, collect.
    # (The loop variable is deliberately NOT named `bootstrap_slope`:
    # that would shadow this function's own name inside its body.)
    slopes = make_array()
    for i in np.arange(repetitions):
        resample = t.sample()
        slopes = np.append(slopes, slope(resample, x, y))

    # Find the endpoints of the 95% confidence interval for the true slope
    # (middle 95% of the bootstrap slopes)
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)
    round_left = round(left, 3)
    round_right = round(right, 3)

    # Slope of the regression line from the original sample
    observed_slope = slope(t, x, y)

    # Display results (no need to follow this code)
    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);
    print('Slope of regression line:', round(observed_slope, 3))
    print('Approximate 95%-confidence interval for the slope of the true line:')
    print(round_left, 'to', round_right)
In [34]:
bootstrap_slope(births, 'Gestational Days', 'Birth Weight')
Slope of regression line: 0.538
Approximate 95%-confidence interval for the slope of the true line:
0.456 to 0.614





Return to Slides




Rain on the Regression Parade (Demo Slide 19)

In [35]:
draw_and_compare(0, 10, 15)

Maternal Age and Birth Weight

In [36]:
births.scatter('Maternal Age', 'Birth Weight')
In [37]:
slope(births, 'Maternal Age', 'Birth Weight')
Out[37]:
0.095142237298344659
In [38]:
births.scatter('Maternal Age', 'Birth Weight', fit_line=True)




Hypothesis Test:

Null: Slope of true line is equal to 0.

Alternative: Slope of true line is not equal to 0.

In [39]:
bootstrap_slope(births, 'Maternal Age', 'Birth Weight')
Slope of regression line: 0.095
Approximate 95%-confidence interval for the slope of the true line:
-0.091 to 0.283
In [ ]: