from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


# Load the family heights CSV into a Table and display it.
families = Table.read_table('family_heights.csv')
families


# Add a "parent average" column: the elementwise mean of the "father" and
# "mother" columns.
families = families.with_column(
    "parent average", (families.column('father') + families.column('mother'))/2.0
)
families

# Same cell repeated verbatim; it recomputes the identical values.
# NOTE(review): with_column is documented to replace a column whose label
# already exists -- confirm this does not add a second column.
families = families.with_column(
    "parent average", (families.column('father') + families.column('mother'))/2.0
)
families


# Scatter plot of the "child" column against the "parent average" column.
families.scatter("parent average", "child")

# Same plot again (repeated cell).
families.scatter("parent average", "child")


# Heights expressed in inches (feet * 12 + remaining inches).
my_height = 5*12 + 11 # 5 ft 11 inches
spouse_height = 5*12 + 7 # 5 ft 7 inches


# Mean of the two heights, computed the same way as the "parent average" column.
our_average = (my_height + spouse_height) / 2.0
our_average

# Recorded notebook output for the expression above.
69.0


# Half-width of the band around a parent average; `window` is also read as a
# global by the prediction helpers defined later in this file.
window = 1 
# Edges of the band centred on our_average.
lower_bound = our_average - window
upper_bound = our_average + window


# Redraw the scatter plot and overlay three vertical lines spanning y = 50 to 85:
# red at lower_bound and upper_bound, orange at our_average.
families.scatter('parent average', 'child')
# You don't need to know the details of this plotting code yet.
plots.plot([lower_bound, lower_bound], [50, 85], color='red', lw=2)
plots.plot([our_average, our_average], [50, 85], color='orange', lw=2);
plots.plot([upper_bound, upper_bound], [50, 85], color='red', lw=2);


def similar_child_heights(parent_average):
    """Return the "child" values for rows whose "parent average" is near the given value.

    The band runs from ``parent_average - window`` to ``parent_average + window``,
    where ``window`` is the notebook-level global. Rows are selected from the
    global ``families`` table with the ``are.between`` predicate.
    """
    low, high = parent_average - window, parent_average + window
    in_band = are.between(low, high)
    nearby = families.where("parent average", in_band)
    return nearby.column("child")

def similar_child_heights(parent_average):
    """Return the "child" values for rows whose "parent average" is near the given value.

    The band runs from ``parent_average - window`` to ``parent_average + window``,
    where ``window`` is the notebook-level global. Rows are selected from the
    global ``families`` table with the ``are.between`` predicate.
    """
    low, high = parent_average - window, parent_average + window
    in_band = are.between(low, high)
    nearby = families.where("parent average", in_band)
    return nearby.column("child")


# window = 1.0
# "child" values for rows whose parent average falls in the band around ours.
similar_child_heights(our_average)

array([ 71. ,  68. ,  70.5,  68.5,  67. ,  64.5,  63. ,  65.5,  74. ,
        70. ,  68. ,  67. ,  67. ,  66. ,  63.5,  63. ,  71. ,  70.5,
        66.7,  72. ,  70.5,  70.2,  70.2,  69.2,  68.7,  66.5,  64.5,
        63.5,  74. ,  73. ,  71.5,  62.5,  66.5,  62.3,  66. ,  64.5,
        64. ,  62.7,  73. ,  71. ,  67. ,  74.2,  70.5,  69.5,  66. ,
        65.5,  65. ,  65. ,  65.5,  66. ,  63. ,  67.5,  67.2,  66.7,
        73.2,  73. ,  69. ,  67. ,  70. ,  67. ,  67. ,  66.5,  70. ,
        69. ,  68.5,  66. ,  64.5,  63. ,  71. ,  67. ,  76. ,  72. ,
        71. ,  66. ,  66. ,  70.5,  72. ,  72. ,  71. ,  69. ,  66. ,
        65. ,  73. ,  65.2,  68.5,  67.7,  68. ,  68. ,  62. ,  72. ,
        71. ,  70.5,  67. ,  72. ,  71. ,  70. ,  66. ,  64.5,  64.5,
        62. ,  71. ,  70. ,  69. ,  69. ,  70. ,  68.7,  68. ,  66. ,
        64. ,  62. ,  75. ,  70. ,  69. ,  66. ,  64. ,  60. ,  67.5,
        73. ,  72. ,  72. ,  66.5,  69.2,  67.2,  66.5,  66. ,  66. ,
        64.2,  63.7,  75. ,  71. ,  70. ,  66. ,  66. ,  65.5,  65. ,
        65. ,  64. ,  64. ,  64. ,  70.5,  67.5,  64.5,  64. ,  71. ,  61.7])


def predict_child_height(parent_average):
    """Predict a child height as np.average of similar_child_heights(parent_average)."""
    nearby_heights = similar_child_heights(parent_average)
    return np.average(nearby_heights)

def predict_child_height(parent_average):
    """Predict a child height as np.average of similar_child_heights(parent_average)."""
    nearby_heights = similar_child_heights(parent_average)
    return np.average(nearby_heights)


# Prediction for our own parent average.
predict_child_height(our_average)

# Recorded notebook output for the call above.
67.799310344827589


# window = 1.0
# Collect the nearby child heights and the prediction derived from them.
similar = similar_child_heights(our_average)
predicted_height = predict_child_height(our_average)

# Print the prediction, then draw a 20-bin histogram of the nearby child
# heights with a red vertical line (y from 0 to 0.1) at the predicted height.
print("Mean:", predicted_height)
Table().with_column("child", similar).hist("child", bins=20)
plots.plot([predicted_height, predicted_height], [0, .1], color="red")

Mean: 67.7993103448

[<matplotlib.lines.Line2D at 0x14eebdea0>]


# window = 0.5
# NOTE(review): the only assignment visible in this file sets window = 1;
# confirm whether window was changed to 0.5 before this cell was run.
# Apply predict_child_height to each row's "parent average" and store the
# results in a "predicted" column.
families = families.with_column(
    "predicted", families.apply(predict_child_height, "parent average"))
families

# window = 0.5
# Same cell repeated verbatim (see the review note above about window).
families = families.with_column(
    "predicted", families.apply(predict_child_height, "parent average"))
families


# Keep only the three columns of interest and scatter "child" and "predicted"
# against "parent average".
(
    families
    .select('parent average','child', 'predicted')
    .scatter('parent average')
)

# Same plot again (repeated cell).
(
    families
    .select('parent average','child', 'predicted')
    .scatter('parent average')
)


def error(predicted, true_value):
    """Return the signed prediction error: predicted minus true_value.

    Positive means the prediction was too high, negative means too low.
    """
    difference = predicted - true_value
    return difference

# Apply error() row by row to the "predicted" and "child" columns and store
# the results in an "error" column.
families = families.with_column(
    "error", families.apply(error, "predicted", "child"))
families

def error(predicted, true_value):
    """Return the signed prediction error: predicted minus true_value.

    Positive means the prediction was too high, negative means too low.
    """
    difference = predicted - true_value
    return difference

# Apply error() row by row to the "predicted" and "child" columns and store
# the results in an "error" column.
families = families.with_column(
    "error", families.apply(error, "predicted", "child"))
families


# Histogram of the "error" column over all rows.
families.hist('error')


# Histogram of the "error" column, grouped by the "sex" column.
families.hist('error', group='sex')

# Same plot again (repeated cell).
families.hist('error', group='sex')

def similar_child_heights(parent_average):
    """Return np.average of the "child" values for rows near the given parent average.

    Rows are taken from the global ``families`` table where "parent average"
    satisfies ``are.between(parent_average - window, parent_average + window)``.

    NOTE(review): this rebinds the name of the earlier definition in this
    file, which returned the selected "child" column itself rather than its
    average. From this point on the name yields a single number -- confirm
    that the redefinition is intentional.
    """
    low = parent_average - window
    high = parent_average + window
    nearby = families.where("parent average", are.between(low, high))
    return np.average(nearby.column("child"))


def predict_child_height_with_sex(parent_average, sex):
    """Predict a child height from rows matching both the sex and the parent average.

    Rows of the global ``families`` table are first restricted to those whose
    "sex" column matches ``sex``, then to those whose "parent average"
    satisfies ``are.between(parent_average - window, parent_average + window)``.
    The result is np.average of the remaining "child" values.
    """
    low = parent_average - window
    high = parent_average + window
    same_sex = families.where("sex", sex)
    nearby = same_sex.where("parent average", are.between(low, high))
    return np.average(nearby.column("child"))

def predict_child_height_with_sex(parent_average, sex):
    """Predict a child height from rows matching both the sex and the parent average.

    Rows of the global ``families`` table are first restricted to those whose
    "sex" column matches ``sex``, then to those whose "parent average"
    satisfies ``are.between(parent_average - window, parent_average + window)``.
    The result is np.average of the remaining "child" values.
    """
    low = parent_average - window
    high = parent_average + window
    same_sex = families.where("sex", sex)
    nearby = same_sex.where("parent average", are.between(low, high))
    return np.average(nearby.column("child"))


# Sex-aware prediction for our parent average, restricted to "male" rows.
predict_child_height_with_sex(our_average, "male")

# Recorded notebook output for the call above.
70.640298507462674


# The same prediction restricted to "female" rows.
predict_child_height_with_sex(our_average, "female")

# Recorded notebook output for the call above.
65.358974358974365


# Apply predict_child_height_with_sex row by row to the "parent average" and
# "sex" columns and store the results in a "predicted with sex" column.
families = families.with_column(
    "predicted with sex", families.apply(predict_child_height_with_sex, "parent average", "sex"))
families

# Same cell repeated verbatim.
families = families.with_column(
    "predicted with sex", families.apply(predict_child_height_with_sex, "parent average", "sex"))
families


# Apply error() to the "predicted with sex" and "child" columns and store the
# results in an "error with sex" column.
families = families.with_column("error with sex", 
                                families.apply(error, "predicted with sex", "child"))

# Histogram of the sex-aware errors, grouped by the "sex" column.
families.hist("error with sex", group="sex")


# Histogram of the original "error" column with the same grouping, for comparison.
families.hist("error", group="sex")


# Load the ice-cream cones CSV into a Table and display it.
cones = Table.read_table('cones.csv')
cones


# Group by "Flavor" with no collector function; the recorded output in this
# file shows a "count" column per flavor.
cones.group('Flavor')

cones.group('Flavor')


# Group by "Flavor" and pass np.average as the collector; the recorded output
# labels the result columns "<column> average".
cones.group('Flavor', np.average)

cones.group('Flavor', np.average)


# Group by "Flavor" and pass np.min as the collector; the recorded output
# labels the result columns "<column> amin".
cones.group('Flavor', np.min)

cones.group('Flavor', np.min)


# Display the table again.
cones


def my_grp(grp):
    """Print the value handed in and return it unchanged.

    Used as a pass-through collector so the values received for each group
    can be inspected.
    """
    print(grp)
    passthrough = grp
    return passthrough

# Group by "Flavor" with my_grp as the collector. The recorded output shows one
# printed array per group for each non-grouping column.
cones.group("Flavor", my_grp)

['pink']
['light brown' 'dark brown' 'dark brown']
['pink' 'pink']
[ 4.75]
[ 4.75  5.25  5.25]
[ 3.55  5.25]
[1]
[4 3 5]
[1 2]

Flavor	count
bubblegum	1
chocolate	3
strawberry	2

Flavor	Price average	Rating average
bubblegum	4.75	1
chocolate	5.08333	4
strawberry	4.4	1.5

Flavor	Price amin	Rating amin
bubblegum	4.75	1
chocolate	4.75	3
strawberry	3.55	1

Lecture 10

Prediction

Exploring the Data

Making a Prediction

Evaluating the Predictions

Building a Better Predictor

Grouping

family	father	mother	child	children	order	sex
1	78.5	67	73.2	4	1	male
1	78.5	67	69.2	4	2	female
1	78.5	67	69	4	3	female
1	78.5	67	69	4	4	female
2	75.5	66.5	73.5	4	1	male
2	75.5	66.5	72.5	4	2	male
2	75.5	66.5	65.5	4	3	female
2	75.5	66.5	65.5	4	4	female
3	75	64	71	2	1	male
3	75	64	68	2	2	female

Flavor	Color	Price	Rating
strawberry	pink	3.55	1
chocolate	light brown	4.75	4
chocolate	dark brown	5.25	3
strawberry	pink	5.25	2
chocolate	dark brown	5.25	5
bubblegum	pink	4.75	1

Flavor	Color my_grp	Price my_grp	Rating my_grp
bubblegum	['pink']	[ 4.75]	[1]
chocolate	['light brown' 'dark brown' 'dark brown']	[ 4.75 5.25 5.25]	[4 3 5]
strawberry	['pink' 'pink']	[ 3.55 5.25]	[1 2]