from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better), so I am using the plotly.express library. We won't test you on this, but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In this lecture, we derive the equation for linear regression using the correlation coefficient $r$.
In the previous lecture, we introduced the correlation coefficient:
\begin{align}
r & = \text{Mean}\left(\text{StandardUnits}(x) * \text{StandardUnits}(y)\right)\\
  & = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) * \text{StandardUnits}(y_i)\\
  & = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) * \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right)
\end{align}

We implemented the correlation coefficient:
def standard_units(x):
    """Rescale an array of numbers to standard units.

    Each value is expressed as its distance from the array's average,
    measured in multiples of the array's standard deviation (``np.std``
    with its default settings).

    Parameters
    ----------
    x : array of numbers
        The values to convert.

    Returns
    -------
    array
        The values of ``x`` expressed in standard units.
    """
    deviations = x - np.average(x)
    spread = np.std(x)
    return deviations / spread
def correlation(t, x, y):
    """Return the correlation coefficient r between two columns of a table.

    Both columns are converted to standard units with ``standard_units``;
    r is the mean of the element-wise products of the converted columns.

    Parameters
    ----------
    t : table
        Object whose ``column(label)`` method returns the values stored
        under ``label``.
    x, y : column labels
        Labels of the two columns of ``t`` to correlate.

    Returns
    -------
    The mean of the products of the two columns in standard units.
    """
    first_su, second_su = (standard_units(t.column(label)) for label in (x, y))
    return np.mean(first_su * second_su)
We built an intuition about the correlation coefficient using the following code, which you don't need to understand:
def make_correlated_data(r, n=500):
    """Generate a table with columns x and y whose correlation is approximately r.

    Parameters
    ----------
    r : number
        Target correlation between the two columns.
    n : int, optional
        Number of rows to generate (default 500).

    Returns
    -------
    Table
        A table with two columns labelled "x" and "y", each of length ``n``.
    """
    # Two independent standard-normal samples; x is drawn first, then the
    # noise term, so the order of the random draws is fixed.
    x = np.random.normal(0, 1, n)
    noise = np.random.normal(0, 1, n)
    # Mixing x with independent noise using weights r and sqrt(1 - r^2)
    # is the "magic" that samples from a multivariate Gaussian.
    noise_weight = np.sqrt(1 - r**2)
    y = r * x + noise_weight * noise
    return Table().with_columns("x", x, "y", y)
def r_scatter(r, n=500, subplot=False):
    """Build a scatter plot of data whose correlation is approximately r.

    Parameters
    ----------
    r : number
        Target correlation, passed to ``make_correlated_data``; also used
        (as a string) for the trace name.
    n : int, optional
        Number of points to generate (default 500).
    subplot : bool, optional
        When true, return the bare ``go.Scatter`` trace so the caller can
        place it in a subplot grid. Otherwise return a new ``go.Figure``
        containing the trace.

    Returns
    -------
    The scatter trace if ``subplot`` is true, else the result of adding
    that trace to a fresh figure.
    """
    points = make_correlated_data(r, n)
    trace = go.Scatter(
        x=points.column("x"),
        y=points.column("y"),
        name=str(r),
        mode="markers",
        marker_opacity=0.5,
    )
    if subplot:
        return trace
    return go.Figure().add_trace(trace)
# Grid of scatter plots (2 rows by 3 columns) illustrating different
# correlation strengths; each panel uses a freshly generated sample of n points.
figs = make_subplots(2,3)
n=500
# Row 1: positive correlations, increasing from left to right.
figs.add_trace(r_scatter(0.2, n, subplot=True), 1, 1)
figs.add_trace(r_scatter(0.5, n, subplot=True), 1, 2)
figs.add_trace(r_scatter(0.8, n, subplot=True), 1, 3)
# Row 2: the matching negative correlations.
figs.add_trace(r_scatter(-0.2, n, subplot=True), 2, 1)
figs.add_trace(r_scatter(-0.5, n, subplot=True), 2, 2)
# NOTE(review): in a notebook this final call is the last expression of the
# cell, so its return value is presumably what renders the figure; confirm
# before folding these calls into a loop.
figs.add_trace(r_scatter(-0.8, n, subplot=True), 2, 3)