In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 6¶

Table Review: Welcome Survey¶

In [2]:
welcome = Table.read_table('welcome_survey_sp23.csv')
welcome.show(5)
Year Extraversion Number of textees Hours of sleep Handedness Pant leg Sleep position Pets Piercings
Second Year 2 5 9 Right-handed Right leg in first On your right side Cat, Dog, Fish, Snake, Lizard -3
First Year 2 3 8 Right-handed I don't know On your back None -1
First Year 5 5 8 Right-handed Right leg in first On your left side Bearded dragon 0
Third Year 8 4 5 Right-handed I don't know On your back Bird 0
First Year 3 15 6 Left-handed Right leg in first On your back Bird 0

... (1496 rows omitted)

In [3]:
# On average, how long do side-sleepers sleep?
side_sleepers = welcome.where('Sleep position', are.containing('side'))
np.average(side_sleepers.column('Hours of sleep'))
Out[3]:
7.0674974039460023
In [4]:
# How many students get at least 8 hours of sleep each night (on average)?
# First way:
welcome.where('Hours of sleep', are.above_or_equal_to(8)).num_rows
Out[4]:
498
In [5]:
# Second way
np.count_nonzero(welcome.column('Hours of sleep') >= 8)
Out[5]:
498
In [6]:
# Third way
np.sum(welcome.column('Hours of sleep') >= 8)
Out[6]:
498
In [7]:
# Create a table with only the two sleep-related columns, 
# with names 'Hours' and 'Position'
two_col = welcome.select('Hours of sleep', 'Sleep position')
sleep = two_col.relabeled(0, 'Hours').relabeled(1, 'Position')
sleep.show(3)
Hours Position
9 On your right side
8 On your back
8 On your left side

... (1498 rows omitted)

In [8]:
# Second way
two_col = welcome.drop(
    'Handedness', 'Extraversion',
    'Pant leg', 'Number of textees',
    'Pets', 'Piercings',
    'Year', 'Extroversion')
sleep = two_col.relabeled(0, 'Hours').relabeled(1, 'Position')
sleep.show(3)
Hours Position
9 On your right side
8 On your back
8 On your left side

... (1498 rows omitted)

Discussion Question: NBA Salaries¶

In [9]:
nba = Table.read_table('nba_salaries.csv')
nba.show(3)
rank name position team salary season
1 Shaquille O'Neal C Los Angeles Lakers 17142000 2000
2 Kevin Garnett PF Minnesota Timberwolves 16806000 2000
3 Alonzo Mourning C Miami Heat 15004000 2000

... (9453 rows omitted)

In [10]:
# Question (a)
guards = nba.where('position', 'PG')
guards.where('salary', are.above(15)).column('name')
Out[10]:
array(['Gary Payton', 'Damon Stoudamire', 'Stephon Marbury', ...,
       'Brandon Goodwin', 'Reggie Jackson', 'Ky Bowman'],
      dtype='<U24')
In [11]:
# Question (b)
nba.drop('position')
nba.num_columns
Out[11]:
6

Census¶

In [12]:
full = Table.read_table('nc-est2019-agesex-res.csv')
full
Out[12]:
SEX AGE CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015 POPESTIMATE2016 POPESTIMATE2017 POPESTIMATE2018 POPESTIMATE2019
0 0 3944153 3944160 3951430 3963092 3926570 3931258 3954787 3983981 3954773 3893990 3815343 3783052
0 1 3978070 3978090 3957730 3966225 3977549 3942698 3948891 3973133 4002903 3972711 3908830 3829599
0 2 4096929 4096939 4090621 3970654 3978925 3991740 3958711 3966321 3991349 4020045 3987032 3922044
0 3 4119040 4119051 4111688 4101644 3981531 3991017 4005928 3974351 3982984 4006946 4033038 3998665
0 4 4063170 4063186 4077346 4121488 4111490 3992502 4004032 4020292 3989750 3997280 4018719 4043323
0 5 4056858 4056872 4064521 4087054 4131049 4121876 4004576 4017589 4035033 4003452 4008443 4028281
0 6 4066381 4066412 4072904 4074531 4096631 4141126 4133372 4017388 4031568 4048018 4014057 4017227
0 7 4030579 4030594 4042990 4082821 4084175 4106756 4152666 4145872 4030888 4044139 4058370 4022319
0 8 4046486 4046497 4025501 4052773 4092559 4094513 4118349 4165033 4158848 4042924 4054236 4066194
0 9 4148353 4148369 4125312 4035319 4062726 4103052 4106068 4130887 4177895 4170813 4053179 4061874

... (296 rows omitted)

In [13]:
partial = full.select('SEX', 'AGE', 'POPESTIMATE2014', 'POPESTIMATE2019')
partial.show(5)
SEX AGE POPESTIMATE2014 POPESTIMATE2019
0 0 3954787 3783052
0 1 3948891 3829599
0 2 3958711 3922044
0 3 4005928 3998665
0 4 4004032 4043323

... (301 rows omitted)

In [14]:
us_pop = partial.relabeled(2, '2014').relabeled(3, '2019')
us_pop.show(5)
SEX AGE 2014 2019
0 0 3954787 3783052
0 1 3948891 3829599
0 2 3958711 3922044
0 3 4005928 3998665
0 4 4004032 4043323

... (301 rows omitted)

In [15]:
us_pop.where('AGE', are.above_or_equal_to(100)).sort('AGE')
Out[15]:
SEX AGE 2014 2019
0 100 70685 100322
1 100 13577 23472
2 100 57108 76850
0 999 318301008 328239523
1 999 156654424 161657324
2 999 161646584 166582199

2019 Sex Ratios¶

In [16]:
us_pop_2019 = us_pop.drop('2014')
us_pop_2019.show(3)
SEX AGE 2019
0 0 3783052
0 1 3829599
0 2 3922044

... (303 rows omitted)

In [17]:
all_ages = us_pop_2019.where('AGE', are.equal_to(999))
all_ages
Out[17]:
SEX AGE 2019
0 999 328239523
1 999 161657324
2 999 166582199
In [18]:
infants = us_pop_2019.where('AGE', are.equal_to(0))
infants
Out[18]:
SEX AGE 2019
0 0 3783052
1 0 1935117
2 0 1847935
In [19]:
females_all_rows = us_pop_2019.where('SEX', are.equal_to(2))
females = females_all_rows.where('AGE', are.not_equal_to(999))
females.show(3)
SEX AGE 2019
2 0 1847935
2 1 1871014
2 2 1916500

... (98 rows omitted)

In [20]:
males_all_rows = us_pop_2019.where('SEX', are.equal_to(1))
males = males_all_rows.where('AGE', are.not_equal_to(999))
males.show(3)
SEX AGE 2019
1 0 1935117
1 1 1958585
1 2 2005544

... (98 rows omitted)

In [21]:
f_to_m_ratios = females.column(2) / males.column(2)

ratios = Table().with_columns(
    'Age', females.column('AGE'),
    'F:M Ratio', f_to_m_ratios
)

ratios
Out[21]:
Age F:M Ratio
0 0.954947
1 0.955289
2 0.955601
3 0.957242
4 0.956177
5 0.954338
6 0.956801
7 0.956633
8 0.955161
9 0.959278

... (91 rows omitted)

In [22]:
ratios.sort('Age', descending=True)
Out[22]:
Age F:M Ratio
100 3.27411
99 2.91367
98 2.65926
97 2.5868
96 2.41969
95 2.34042
94 2.2271
93 2.10192
92 1.99252
91 1.90275

... (91 rows omitted)

Line Plot¶

In [23]:
ratios.plot('Age', 'F:M Ratio')