from datascience import *
import numpy as np


%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
plots.rcParams["patch.force_edgecolor"] = True


# Load the movies dataset from the CSV file and display it.
top_movies = Table.read_table('top_movies_2017.csv')
top_movies


# A tiny single-column table used to demonstrate grouping.
pet_names = make_array("Cat", "Dog", "Dog", "Bird", "Cat")
toy = Table().with_columns("Pets", pet_names)
toy


# One row per distinct pet, with a "count" column of how often it appears.
toy.group("Pets")


# Count how many movies each studio has in the table.
studio_only = top_movies.select('Studio')
studio_counts = studio_only.group("Studio")
studio_counts


# Horizontal bar chart of the counts, largest count first.
counts_descending = studio_counts.sort("count", descending=True)
counts_descending.barh("Studio", "count")


# Convert each studio's count into a percentage of all counted movies.
count_col = studio_counts.column('count')
total_count = count_col.sum()
studio_counts = studio_counts.with_column("percent", count_col / total_count * 100)
studio_counts


# Same chart as above, but on the percent scale.
percents_descending = studio_counts.sort("percent", descending=True)
percents_descending.barh("Studio", "percent")


# Preview only: show the first five rows.
first_five = np.arange(5)
top_movies.take(first_five)


# Age of each movie in years, measured from a fixed reference year.
this_year = 2023
release_years = top_movies.column('Year')
ages = this_year - release_years
top_movies = top_movies.with_column('Age', ages)
top_movies


# Hand-picked, unequal-width bin edges for movie ages.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 102)


# Count the movies whose age falls in each custom bin.
binned_data = top_movies.bin('Age', bins=my_bins)
binned_data


# Equal-width alternative: edges every 25 years, from 0 through 125.
quarter_century_edges = np.arange(0, 126, 25)
top_movies.bin('Age', bins=quarter_century_edges)


# Pass an integer instead of explicit edges
# (presumably the number of equal-width bins -- confirm in the Table.bin docs).
top_movies.bin('Age', bins=10)


# First histogram: movie ages drawn with the custom, unequal-width bins.
top_movies.hist('Age', bins=my_bins, unit='Year')


# Equally spaced alternative: ten-year-wide bins with edges 0, 10, ..., 100.
decade_edges = np.arange(0, 110, 10)
top_movies.hist('Age', bins=decade_edges, unit='Year')


# An integer for `bins` instead of explicit edges.
top_movies.hist('Age', bins=15, unit='Year')


# No `bins` argument at all: whatever binning the library picks by default.
top_movies.hist('Age', unit='Year')


# Same call as the previous cell, but through `ihist`
# (presumably the interactive variant of `hist` -- confirm in the library docs).
top_movies.ihist('Age', unit='Year')


# Distribution of adjusted gross across all movies, with bins=20.
top_movies.ihist("Gross (Adjusted)", bins=20)


# Keep only the Disney and Buena Vista rows, then compare release years
# with the histogram grouped by studio.
studio_pair = make_array("Disney", "Buena Vista")
disney_or_buena_vista = top_movies.where("Studio", are.contained_in(studio_pair))
disney_or_buena_vista.ihist("Year", group="Studio", bins=20)


# Same two-studio comparison, now on adjusted gross.
# The filter is rebuilt here so this cell does not rely on the previous one.
studio_pair = make_array("Disney", "Buena Vista")
disney_or_buena_vista = top_movies.where("Studio", are.contained_in(studio_pair))
disney_or_buena_vista.ihist("Gross (Adjusted)", group="Studio", bins=20)


# Redraw the custom-bin histogram, then show the per-bin counts table.
top_movies.hist('Age', bins=my_bins, unit='Year')
binned_data


# Add a column containing what percent of movies are in each bin
age_counts = binned_data.column('Age count')
percent_of_movies = 100 * age_counts / age_counts.sum()
binned_data = binned_data.with_column('Percent', percent_of_movies)
binned_data


# Step 1: Calculate % of movies in the [40, 65) bin
# (the row is looked up by the bin's left endpoint, 40)
bin_40_row = binned_data.where('bin', 40)
percent = bin_40_row.column('Percent').item(0)
percent

# Output: 28.5


# Step 2: Calculate the width of the 40-65 bin
bin_left, bin_right = 40, 65
width = bin_right - bin_left


# Step 3: The bar's area is the percent of movies in the bin, and
#         area = height * width, so height = percent / width.
height = percent / width
height

# Output: 1.14


binned_data


# Every row except the last; its "bin" value is that bin's left endpoint.
all_but_last = np.arange(binned_data.num_rows - 1)
bi = binned_data.take(all_but_last).relabeled("bin", "Left")
bi


# Every "bin" value except the first: shifted by one row, these are the
# right endpoints matching the rows kept in `bi`.
all_but_first = np.arange(1, binned_data.num_rows)
right_bins = binned_data.take(all_but_first).column("bin")
right_bins

# Output: array([  5,  10,  15,  25,  40,  65, 102])


# Attach each bin's right endpoint next to its left endpoint.
bi = bi.with_column("Right", right_bins)
bi


# Width = Right - Left; Height = Percent per unit of width.
bin_widths = bi.column("Right") - bi.column("Left")
bi = bi.with_column("Width", bin_widths)
bin_heights = bi.column("Percent") / bi.column("Width")
bi = bi.with_column("Height", bin_heights)
bi

Title	Studio	Gross	Gross (Adjusted)	Year
Gone with the Wind	MGM	198676459	1796176700	1939
Star Wars	Fox	460998007	1583483200	1977
The Sound of Music	Fox	158671368	1266072700	1965
E.T.: The Extra-Terrestrial	Universal	435110554	1261085000	1982
Titanic	Paramount	658672302	1204368000	1997
The Ten Commandments	Paramount	65500000	1164590000	1956
Jaws	Universal	260000000	1138620700	1975
Doctor Zhivago	MGM	111721910	1103564200	1965
The Exorcist	Warner Brothers	232906145	983226600	1973
Snow White and the Seven Dwarves	Disney	184925486	969010000	1937

Studio	count
AVCO	1
Buena Vista	35
Columbia	9
Disney	11
Dreamworks	3
Fox	24
IFC	1
Lionsgate	3
MGM	7
Metro	1

Studio	count	percent
AVCO	1	0.5
Buena Vista	35	17.5
Columbia	9	4.5
Disney	11	5.5
Dreamworks	3	1.5
Fox	24	12
IFC	1	0.5
Lionsgate	3	1.5
MGM	7	3.5
Metro	1	0.5

Title	Studio	Gross	Gross (Adjusted)	Year
Gone with the Wind	MGM	198676459	1796176700	1939
Star Wars	Fox	460998007	1583483200	1977
The Sound of Music	Fox	158671368	1266072700	1965
E.T.: The Extra-Terrestrial	Universal	435110554	1261085000	1982
Titanic	Paramount	658672302	1204368000	1997

Title	Studio	Gross	Gross (Adjusted)	Year	Age
Gone with the Wind	MGM	198676459	1796176700	1939	84
Star Wars	Fox	460998007	1583483200	1977	46
The Sound of Music	Fox	158671368	1266072700	1965	58
E.T.: The Extra-Terrestrial	Universal	435110554	1261085000	1982	41
Titanic	Paramount	658672302	1204368000	1997	26
The Ten Commandments	Paramount	65500000	1164590000	1956	67
Jaws	Universal	260000000	1138620700	1975	48
Doctor Zhivago	MGM	111721910	1103564200	1965	58
The Exorcist	Warner Brothers	232906145	983226600	1973	50
Snow White and the Seven Dwarves	Disney	184925486	969010000	1937	86

Lecture 8

Categorical Distribution

Distributions of Numerical Data

Histograms

Exploring Data Distributions

Height

Left	Age count	Percent	Right
0	0	0	5
5	21	10.5	10
10	17	8.5	15
15	41	20.5	25
25	43	21.5	40
40	57	28.5	65
65	21	10.5	102

Left	Age count	Percent	Right	Width	Height
0	0	0	5	5	0
5	21	10.5	10	5	2.1
10	17	8.5	15	5	1.7
15	41	20.5	25	10	2.05
25	43	21.5	40	15	1.43333
40	57	28.5	65	25	1.14
65	21	10.5	102	37	0.283784

Pets
Cat
Dog
Dog
Bird
Cat

Pets	count
Bird	1
Cat	2
Dog	2

bin	Age count
6	41
15.6	40
25.2	28
34.8	24
44.4	25
54	19
63.6	13
73.2	6
82.8	3
92.4	1