A Tufts University Data Lab Tutorial
Written by Uku-Kaspar Uustalu
Contact: uku-kaspar.uustalu@tufts.edu
Last updated: 2023-03-02
We will be using the following Python data analysis and visualization libraries throughout this tutorial:

- Pandas -- usually imported under the alias pd
- Matplotlib -- we will mostly use its matplotlib.pyplot module, which is usually imported under the alias plt
- Seaborn -- usually imported under the alias sns
- HVPlot -- its hvplot.pandas module must be imported to allow for seamless integration with Pandas
- Plotly -- its plotly.express module is the easiest to use as it allows for the creation of whole plots using a single command; the module is usually imported under the alias px

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
import plotly.express as px
For the first part of this tutorial, we will be using the following datasets from the data directory to investigate the relationship between health and wealth:

- gdp.csv -- World Bank gross domestic product (GDP) estimates (in USD) for world countries and regions from 1960 until 2021
- life-expectancy.csv -- World Bank life expectancy estimates for world countries and regions from 1960 until 2020
- m49.csv -- United Nations M49 Standard Country or Area Codes for Statistical Use
- population.csv -- World Bank population estimates for world countries and regions from 1960 until 2021

All the datasets are in IETF RFC 4180 CSV (comma-separated values) format, and the first four rows of the World Bank data files contain metadata, with the actual data table starting on row five.
Let us start by reading in the population data. Pandas can easily read CSV datasets via the pandas.read_csv()
function. The function reads the contents of the file into a pandas.DataFrame
data structure and supports various additional arguments. For example, we can utilize the skiprows
argument to tell Pandas to skip the first four rows of the dataset, as the data table does not start until row five.
population = pd.read_csv('data/population.csv', skiprows=4)
Now the World Bank population dataset is stored in a DataFrame called population
. Calling the DataFrame by its name will display the first and last five rows of the table by default.
population
Country Name | Country Code | Indicator Name | Indicator Code | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | ... | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aruba | ABW | Population, total | SP.POP.TOTL | 54608.0 | 55811.0 | 56682.0 | 57475.0 | 58178.0 | 58782.0 | ... | 102112.0 | 102880.0 | 103594.0 | 104257.0 | 104874.0 | 105439.0 | 105962.0 | 106442.0 | 106585.0 | 106537.0 |
1 | Africa Eastern and Southern | AFE | Population, total | SP.POP.TOTL | 130692579.0 | 134169237.0 | 137835590.0 | 141630546.0 | 145605995.0 | 149742351.0 | ... | 552530654.0 | 567891875.0 | 583650827.0 | 600008150.0 | 616377331.0 | 632746296.0 | 649756874.0 | 667242712.0 | 685112705.0 | 702976832.0 |
2 | Afghanistan | AFG | Population, total | SP.POP.TOTL | 8622466.0 | 8790140.0 | 8969047.0 | 9157465.0 | 9355514.0 | 9565147.0 | ... | 30466479.0 | 31541209.0 | 32716210.0 | 33753499.0 | 34636207.0 | 35643418.0 | 36686784.0 | 37769499.0 | 38972230.0 | 40099462.0 |
3 | Africa Western and Central | AFW | Population, total | SP.POP.TOTL | 97256290.0 | 99314028.0 | 101445032.0 | 103667517.0 | 105959979.0 | 108336203.0 | ... | 376797999.0 | 387204553.0 | 397855507.0 | 408690375.0 | 419778384.0 | 431138704.0 | 442646825.0 | 454306063.0 | 466189102.0 | 478185907.0 |
4 | Angola | AGO | Population, total | SP.POP.TOTL | 5357195.0 | 5441333.0 | 5521400.0 | 5599827.0 | 5673199.0 | 5736582.0 | ... | 25188292.0 | 26147002.0 | 27128337.0 | 28127721.0 | 29154746.0 | 30208628.0 | 31273533.0 | 32353588.0 | 33428486.0 | 34503774.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
261 | Kosovo | XKX | Population, total | SP.POP.TOTL | 947000.0 | 966000.0 | 994000.0 | 1022000.0 | 1050000.0 | 1078000.0 | ... | 1807106.0 | 1818117.0 | 1812771.0 | 1788196.0 | 1777557.0 | 1791003.0 | 1797085.0 | 1788878.0 | 1790133.0 | 1786038.0 |
262 | Yemen, Rep. | YEM | Population, total | SP.POP.TOTL | 5542459.0 | 5646668.0 | 5753386.0 | 5860197.0 | 5973803.0 | 6097298.0 | ... | 26223391.0 | 26984002.0 | 27753304.0 | 28516545.0 | 29274002.0 | 30034389.0 | 30790513.0 | 31546691.0 | 32284046.0 | 32981641.0 |
263 | South Africa | ZAF | Population, total | SP.POP.TOTL | 16520441.0 | 16989464.0 | 17503133.0 | 18042215.0 | 18603097.0 | 19187194.0 | ... | 53145033.0 | 53873616.0 | 54729551.0 | 55876504.0 | 56422274.0 | 56641209.0 | 57339635.0 | 58087055.0 | 58801927.0 | 59392255.0 |
264 | Zambia | ZMB | Population, total | SP.POP.TOTL | 3119430.0 | 3219451.0 | 3323427.0 | 3431381.0 | 3542764.0 | 3658024.0 | ... | 14744658.0 | 15234976.0 | 15737793.0 | 16248230.0 | 16767761.0 | 17298054.0 | 17835893.0 | 18380477.0 | 18927715.0 | 19473125.0 |
265 | Zimbabwe | ZWE | Population, total | SP.POP.TOTL | 3806310.0 | 3925952.0 | 4049778.0 | 4177931.0 | 4310332.0 | 4447149.0 | ... | 13265331.0 | 13555422.0 | 13855753.0 | 14154937.0 | 14452704.0 | 14751101.0 | 15052184.0 | 15354608.0 | 15669666.0 | 15993524.0 |
266 rows × 66 columns
We see that the DataFrame appears to have the following columns:

- Country Name -- English name of the country
- Country Code -- ISO 3166-1 alpha-3 country code
- Indicator Name -- name of the indicator represented by the data
- Indicator Code -- World Bank code for the indicator
- 1960 ... 2021 -- population estimates by year

We also see that the DataFrame has 266 rows and 66 columns. We can double-check this by looking at the value of the pandas.DataFrame.shape attribute.
population.shape
(266, 66)
The pandas.DataFrame.size
attribute will give us the total number of values in the table (number of columns times number of rows).
population.size
17556
pandas.DataFrame.columns
can be used to get a list of all the column names and pandas.DataFrame.dtypes
will display the datatype of each column.
population.columns
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021'], dtype='object')
population.dtypes
Country Name object Country Code object Indicator Name object Indicator Code object 1960 float64 ... 2017 float64 2018 float64 2019 float64 2020 float64 2021 float64 Length: 66, dtype: object
Note how the first four columns all have the object
datatype. This could mean that the column contains textual data (string), has a mix of different datatypes (both textual and numeric for example), or contains a more complex data structure (like a list or tuple). The population columns are all float64
denoting floating-point numbers. It might feel odd to store population values as floating-point numbers, as population counts are always whole integers. However, in Pandas all numeric data is stored as floating-point numbers by default, because integer columns in Pandas do not support missing data values out of the box. The default missing data value in Pandas is numpy.nan from NumPy, which is itself a float64 value.
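As an aside (not part of our World Bank workflow), here is a minimal sketch illustrating this behavior, including the nullable Int64 extension dtype that can store integers alongside missing values:

import numpy as np

s = pd.Series([1, 2, np.nan])   # a missing value forces float64 by default
s.dtype                         # float64

s = pd.Series([1, 2, np.nan], dtype='Int64')   # nullable integer extension dtype
s.dtype                         # Int64 (the missing value is shown as <NA>)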
We know that the population
DataFrame stores population values, so the Indicator Name
and Indicator Code
columns are redundant. We can drop them from the table using the pandas.DataFrame.drop()
method.
population.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
Note how we specified two arguments when calling the pandas.DataFrame.drop()
method. First we specified a list of columns to drop using the columns
argument. The pandas.DataFrame.drop()
method also supports dropping rows, so that is why the columns
argument is needed. Then we also specified inplace
to be True
. This ensures that the original population
DataFrame gets modified. Otherwise the method would just return a new DataFrame and keep the population
DataFrame unchanged.
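For reference, the non-inplace equivalent would have been to reassign the result back to the same variable (do not run both versions, as the columns would already be gone):

# equivalent alternative: drop() returns a new DataFrame that we reassign
population = population.drop(columns=['Indicator Name', 'Indicator Code'])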
We can validate that the desired columns have been removed by taking a quick peek at the DataFrame via the pandas.DataFrame.head()
method. It displays the first five rows of the DataFrame by default, but you can also pass the desired number of rows as an argument.
population.head()
Country Name | Country Code | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | 1966 | 1967 | ... | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aruba | ABW | 54608.0 | 55811.0 | 56682.0 | 57475.0 | 58178.0 | 58782.0 | 59291.0 | 59522.0 | ... | 102112.0 | 102880.0 | 103594.0 | 104257.0 | 104874.0 | 105439.0 | 105962.0 | 106442.0 | 106585.0 | 106537.0 |
1 | Africa Eastern and Southern | AFE | 130692579.0 | 134169237.0 | 137835590.0 | 141630546.0 | 145605995.0 | 149742351.0 | 153955516.0 | 158313235.0 | ... | 552530654.0 | 567891875.0 | 583650827.0 | 600008150.0 | 616377331.0 | 632746296.0 | 649756874.0 | 667242712.0 | 685112705.0 | 702976832.0 |
2 | Afghanistan | AFG | 8622466.0 | 8790140.0 | 8969047.0 | 9157465.0 | 9355514.0 | 9565147.0 | 9783147.0 | 10010030.0 | ... | 30466479.0 | 31541209.0 | 32716210.0 | 33753499.0 | 34636207.0 | 35643418.0 | 36686784.0 | 37769499.0 | 38972230.0 | 40099462.0 |
3 | Africa Western and Central | AFW | 97256290.0 | 99314028.0 | 101445032.0 | 103667517.0 | 105959979.0 | 108336203.0 | 110798486.0 | 113319950.0 | ... | 376797999.0 | 387204553.0 | 397855507.0 | 408690375.0 | 419778384.0 | 431138704.0 | 442646825.0 | 454306063.0 | 466189102.0 | 478185907.0 |
4 | Angola | AGO | 5357195.0 | 5441333.0 | 5521400.0 | 5599827.0 | 5673199.0 | 5736582.0 | 5787044.0 | 5827503.0 | ... | 25188292.0 | 26147002.0 | 27128337.0 | 28127721.0 | 29154746.0 | 30208628.0 | 31273533.0 | 32353588.0 | 33428486.0 | 34503774.0 |
5 rows × 64 columns
Knowing that the World Bank GDP dataset follows the exact same format as the World Bank population dataset, we can read it in and drop the Indicator Name
and Indicator Code
columns all in one go by chaining together the pandas.read_csv()
function and the pandas.DataFrame.drop()
method. If we want to include a line break somewhere in the chain, we need to wrap the whole thing in parentheses ()
.
gdp = (pd.read_csv('data/gdp.csv', skiprows=4)
.drop(columns=['Indicator Name', 'Indicator Code']))
Note how here we did not specify inplace=True
when dropping the columns. That is because we want the pandas.DataFrame.drop()
method to take the DataFrame generated by pandas.read_csv()
and then output a new DataFrame that we can save into the gdp
variable. We can take a look at our newly created DataFrame by using the pandas.DataFrame.head()
method again.
gdp.head()
Country Name | Country Code | 1960 | 1961 | 1962 | 1963 | 1964 | 1965 | 1966 | 1967 | ... | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aruba | ABW | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 2.615084e+09 | 2.727933e+09 | 2.791061e+09 | 2.963128e+09 | 2.983799e+09 | 3.092179e+09 | 3.202235e+09 | 3.368970e+09 | 2.610039e+09 | 3.126019e+09 |
1 | Africa Eastern and Southern | AFE | 2.129081e+10 | 2.180870e+10 | 2.370727e+10 | 2.821034e+10 | 2.611906e+10 | 2.968249e+10 | 3.223946e+10 | 3.351491e+10 | ... | 9.725734e+11 | 9.834729e+11 | 1.003768e+12 | 9.245228e+11 | 8.827213e+11 | 1.021119e+12 | 1.007240e+12 | 1.001017e+12 | 9.274845e+11 | 1.080712e+12 |
2 | Afghanistan | AFG | 5.377778e+08 | 5.488889e+08 | 5.466667e+08 | 7.511112e+08 | 8.000000e+08 | 1.006667e+09 | 1.400000e+09 | 1.673333e+09 | ... | 2.020357e+10 | 2.056449e+10 | 2.055058e+10 | 1.999816e+10 | 1.801956e+10 | 1.889635e+10 | 1.841885e+10 | 1.890449e+10 | 2.014344e+10 | 1.478686e+10 |
3 | Africa Western and Central | AFW | 1.040414e+10 | 1.112789e+10 | 1.194319e+10 | 1.267633e+10 | 1.383837e+10 | 1.486223e+10 | 1.583259e+10 | 1.442604e+10 | ... | 7.360399e+11 | 8.322169e+11 | 8.924979e+11 | 7.669580e+11 | 6.905454e+11 | 6.837480e+11 | 7.663597e+11 | 7.947191e+11 | 7.847997e+11 | 8.401873e+11 |
4 | Angola | AGO | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.249982e+11 | 1.334016e+11 | 1.372444e+11 | 8.721930e+10 | 4.984049e+10 | 6.897277e+10 | 7.779294e+10 | 6.930911e+10 | 5.361907e+10 | 6.740429e+10 |
5 rows × 64 columns
GDP on its own is not a good indicator of the wealth of a country as countries with more people tend to have higher GDP. But if we were to normalize GDP by population, then the resulting GDP per capita values can be compared across countries and used as a proxy for wealth. To do so, we must be able to match up the GDP and population values for each unique combination of country and year.
The GDP and population tables currently are in wide format -- each row represents a unique country and each column represents a unique year, with the cell values holding the estimates. While this wide format has many advantages and is commonly used in geospatial applications, it does complicate joining various datasets. One option would be to treat both tables as matrices and calculate GDP per capita by dividing the GDP matrix by the population matrix element-wise. However, both tables would need to have the exact same layout, with the same countries and years in the same exact order, for this to work and for the result to be reliable. Ensuring this is not a trivial task, so this method would involve a lot of work to produce reliable results.
Alternatively, the two tables could be joined by country. Then we would have an extra-wide table with two sets of year columns -- one set of year columns for population and another set for GDP. We would then need to create a new column for each year by dividing the corresponding GDP column by the corresponding population column, resulting in yet another set of year columns. As you can see, this approach would quickly lead to a very messy and difficult-to-manage dataset and would also involve a lot of work, making it far from preferred.
The easiest option for calculating GDP per capita would involve converting both datasets into a long format, where each row represents a single unique observation (estimation). Instead of having countries in rows and years in columns, each row would instead represent a unique country and year combination. This would allow us to easily combine datasets on both country and year, ensuring that the GDP and population values for each country-year combination get matched.
We can use the pandas.DataFrame.melt()
method to convert wide DataFrames to long format. We need to specify three arguments when using this method:

- id_vars – name(s) of the column(s) that define a unique observation in the original wide dataset
- var_name – name of the column in the new long dataset that stores the column names of the original wide dataset
- value_name – name of the column in the new long dataset that stores the values of the original wide dataset

Each observation in the original wide dataset represents a unique country defined either by the country name or country code. Let us include both of these as id_vars to carry both columns over to the long dataset. The columns of the wide dataset represent years, so that is the name we will pass on to the var_name argument. The values of the wide dataset represent population estimates, so that will be the name passed on to the value_name argument.
The reverse of pandas.DataFrame.melt() is pandas.DataFrame.pivot(), which converts a long format table back to wide format (a brief sketch follows the output below).
population_long = population.melt(id_vars=['Country Name', 'Country Code'],
var_name='year',
value_name='population')
population_long
Country Name | Country Code | year | population | |
---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 |
1 | Africa Eastern and Southern | AFE | 1960 | 130692579.0 |
2 | Afghanistan | AFG | 1960 | 8622466.0 |
3 | Africa Western and Central | AFW | 1960 | 97256290.0 |
4 | Angola | AGO | 1960 | 5357195.0 |
... | ... | ... | ... | ... |
16487 | Kosovo | XKX | 2021 | 1786038.0 |
16488 | Yemen, Rep. | YEM | 2021 | 32981641.0 |
16489 | South Africa | ZAF | 2021 | 59392255.0 |
16490 | Zambia | ZMB | 2021 | 19473125.0 |
16491 | Zimbabwe | ZWE | 2021 | 15993524.0 |
16492 rows × 4 columns
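As mentioned above, the melt can be undone with pandas.DataFrame.pivot(). A minimal sketch (the variable name population_wide is our own choice):

# undo the melt: back to one row per country and one column per year
population_wide = population_long.pivot(index=['Country Name', 'Country Code'],
                                        columns='year',
                                        values='population').reset_index()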
Now we have a new long population DataFrame called population_long
, where each row represents a unique country and year combination. Let us use pandas.DataFrame.dtypes
to confirm the data types of this new table.
population_long.dtypes
Country Name object Country Code object year object population float64 dtype: object
Note how the year
column is of type object
, meaning that the years are currently stored as strings. As the years were previously column names, this makes sense. However, as years are actually numbers, they should also be stored as such to allow for easy comparisons and mathematical operations.
To convert the year values to integers, we must first extract the year
column as a pandas.Series
object. This can be done by either using square brackets df["column"]
or via dot-notation df.column
. The latter requires the column name to consist of only letters, numbers, and underscores (and not start with a number), so it is only useful if the column names are neatly formatted. Using square brackets to extract columns is more robust and as the column name is passed as a string, it can contain spaces and other special characters.
Square brackets can be used to also create a new column or overwrite an existing column. Dot-notation should only be used to read columns. Attempting to write columns using dot-notation could have unexpected consequences.
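For instance, both notations below read the neatly named year column, while a column name containing a space works only with square brackets:

population_long['year']          # square brackets always work
population_long.year             # dot-notation works for simple names
population_long['Country Name']  # works despite the space in the name
# population_long.Country Name   # would be a SyntaxError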
Knowing this, let us extract the year
column as a Series object using dot-notation df.column
and then call pandas.Series.astype()
on the extracted values to convert them to integers. Then we can use square bracket notation df["column"]
to replace the values of the year
column with their integer equivalents.
population_long['year'] = population_long.year.astype(int)
population_long.head()
Country Name | Country Code | year | population | |
---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 |
1 | Africa Eastern and Southern | AFE | 1960 | 130692579.0 |
2 | Afghanistan | AFG | 1960 | 8622466.0 |
3 | Africa Western and Central | AFW | 1960 | 97256290.0 |
4 | Angola | AGO | 1960 | 5357195.0 |
population_long.dtypes
Country Name object Country Code object year int64 population float64 dtype: object
Note how the values of the year
column seemingly did not change, but the datatype of the values is now int64
, which means that the values have been converted to numeric integers.
Now let us convert the GDP dataset to long format as well. We can chain the pandas.DataFrame.melt()
method together with the pandas.DataFrame.astype()
method to convert the DataFrame from wide to long format and change the datatype of the year
column to integer all in one go. The pandas.DataFrame.astype()
method is very similar to the pandas.Series.astype()
method, but instead of taking a single datatype as an argument, it takes a dictionary that maps column names to datatypes.
gdp_long = (gdp.melt(id_vars=['Country Name', 'Country Code'],
var_name='year',
value_name='gdp')
.astype({'year': int}))
gdp_long
Country Name | Country Code | year | gdp | |
---|---|---|---|---|
0 | Aruba | ABW | 1960 | NaN |
1 | Africa Eastern and Southern | AFE | 1960 | 2.129081e+10 |
2 | Afghanistan | AFG | 1960 | 5.377778e+08 |
3 | Africa Western and Central | AFW | 1960 | 1.040414e+10 |
4 | Angola | AGO | 1960 | NaN |
... | ... | ... | ... | ... |
16487 | Kosovo | XKX | 2021 | 9.412034e+09 |
16488 | Yemen, Rep. | YEM | 2021 | NaN |
16489 | South Africa | ZAF | 2021 | 4.190150e+11 |
16490 | Zambia | ZMB | 2021 | 2.214763e+10 |
16491 | Zimbabwe | ZWE | 2021 | 2.837124e+10 |
16492 rows × 4 columns
gdp_long.dtypes
Country Name object Country Code object year int64 gdp float64 dtype: object
Finally we are ready to combine the population and GDP datasets. pandas.DataFrame.merge()
can be used to perform a join on one or more columns. The method is called on the left DataFrame and takes the right DataFrame as its first argument (this is only important to know when performing a left or right join). Additional arguments are as follows:

- on – a single column name (string) or list of column names to join on. These column names should appear in both tables. If the column names differ between datasets, the separate left_on and right_on arguments should be used instead.
- how – the type of join to perform. Here are the possible values:
  - "left" – use only keys from the left DataFrame (include all rows from the left DataFrame)
  - "right" – use only keys from the right DataFrame (include all rows from the right DataFrame)
  - "outer" – use the union of keys from both DataFrames (include all rows from both DataFrames)
  - "inner" – use the intersection of keys from both DataFrames (include only matching rows)
  - "cross" – create the Cartesian product of both DataFrames (similar to cross-tabulation)

We would like to join on each unique country and year combination. As spellings of country names might differ between datasets, it is good practice to always use the ISO 3166-1 alpha-3 country code or some other analogous unique identifier to distinguish between countries. The country code for each country is determined by an international standard and should not differ between datasets, allowing us to reliably join the data. Hence we will specify on=["Country Code", "year"] to perform the join on unique country-year combinations and how="inner" to only keep country-year combinations that are present in both datasets. Since we do not want the Country Name column repeated in the joined dataset, we should remove it from the GDP table using pandas.DataFrame.drop() before performing the join. Otherwise the Country Name column from the GDP dataset will also get joined, resulting in the joined table having two separate columns with country names.
data = population_long.merge(gdp_long.drop(columns='Country Name'),
on=['Country Code', 'year'],
how='inner')
data.head()
Country Name | Country Code | year | population | gdp | |
---|---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 | NaN |
1 | Africa Eastern and Southern | AFE | 1960 | 130692579.0 | 2.129081e+10 |
2 | Afghanistan | AFG | 1960 | 8622466.0 | 5.377778e+08 |
3 | Africa Western and Central | AFW | 1960 | 97256290.0 | 1.040414e+10 |
4 | Angola | AGO | 1960 | 5357195.0 | NaN |
Now we have a table with a population and GDP value for each country and year combination. We can easily add a new column denoting GDP per capita to this table by dividing the GDP column by the population column.
data['gdp_per_capita'] = data.gdp / data.population
data.head()
Country Name | Country Code | year | population | gdp | gdp_per_capita | |
---|---|---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 | NaN | NaN |
1 | Africa Eastern and Southern | AFE | 1960 | 130692579.0 | 2.129081e+10 | 162.907576 |
2 | Afghanistan | AFG | 1960 | 8622466.0 | 5.377778e+08 | 62.369375 |
3 | Africa Western and Central | AFW | 1960 | 97256290.0 | 1.040414e+10 | 106.976475 |
4 | Angola | AGO | 1960 | 5357195.0 | NaN | NaN |
Now we would also like to add life expectancy information to this joined dataset. Knowing that all World Bank data tables follow the same format, we can easily convert the workflow from before into a function that reads in a World Bank dataset, drops unneeded columns, converts it to long format, and ensures the year is in numeric format. That function would only need two inputs – the path of the CSV file and the name of the indicator represented by the data. (This name will be used as the column name for the values column in the long format table.) Let us define this function and use it to read in the World Bank life expectancy dataset and convert it to long format.
def read_world_bank_data(file_name, value_name):
return (pd.read_csv(file_name, skiprows=4)
.drop(columns=['Indicator Name', 'Indicator Code'])
.melt(id_vars=['Country Name', 'Country Code'],
var_name='year',
value_name=value_name)
.astype({'year': int}))
life_exp = read_world_bank_data(file_name='data/life-expectancy.csv',
value_name='life_exp')
life_exp.head()
Country Name | Country Code | year | life_exp | |
---|---|---|---|---|
0 | Aruba | ABW | 1960 | 64.152000 |
1 | Africa Eastern and Southern | AFE | 1960 | 44.085552 |
2 | Afghanistan | AFG | 1960 | 32.535000 |
3 | Africa Western and Central | AFW | 1960 | 37.845152 |
4 | Angola | AGO | 1960 | 38.211000 |
Using the same workflow from before, we can join the long format life expectancy dataset to our table containing the GDP and population data.
data = data.merge(life_exp.drop(columns='Country Name'),
on=['Country Code', 'year'],
how='inner')
data.head()
Country Name | Country Code | year | population | gdp | gdp_per_capita | life_exp | |
---|---|---|---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 | NaN | NaN | 64.152000 |
1 | Africa Eastern and Southern | AFE | 1960 | 130692579.0 | 2.129081e+10 | 162.907576 | 44.085552 |
2 | Afghanistan | AFG | 1960 | 8622466.0 | 5.377778e+08 | 62.369375 | 32.535000 |
3 | Africa Western and Central | AFW | 1960 | 97256290.0 | 1.040414e+10 | 106.976475 | 37.845152 |
4 | Angola | AGO | 1960 | 5357195.0 | NaN | NaN | 38.211000 |
Finally we would also like to know which United Nations regional geoscheme the country belongs to. Information on this is available in the United Nations M49 dataset. As this dataset is a standard CSV table, we can use pandas.read_csv()
without any additional arguments to read it into a DataFrame.
m49 = pd.read_csv('data/m49.csv')
m49.head()
Global Code | Global Name | Region Code | Region Name | Sub-region Code | Sub-region Name | Intermediate Region Code | Intermediate Region Name | Country or Area | M49 Code | ISO-alpha2 Code | ISO-alpha3 Code | Least Developed Countries (LDC) | Land Locked Developing Countries (LLDC) | Small Island Developing States (SIDS) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | World | 2.0 | Africa | 15.0 | Northern Africa | NaN | NaN | Algeria | 12 | DZ | DZA | NaN | NaN | NaN |
1 | 1 | World | 2.0 | Africa | 15.0 | Northern Africa | NaN | NaN | Egypt | 818 | EG | EGY | NaN | NaN | NaN |
2 | 1 | World | 2.0 | Africa | 15.0 | Northern Africa | NaN | NaN | Libya | 434 | LY | LBY | NaN | NaN | NaN |
3 | 1 | World | 2.0 | Africa | 15.0 | Northern Africa | NaN | NaN | Morocco | 504 | MA | MAR | NaN | NaN | NaN |
4 | 1 | World | 2.0 | Africa | 15.0 | Northern Africa | NaN | NaN | Sudan | 729 | SD | SDN | x | NaN | NaN |
m49.columns
Index(['Global Code', 'Global Name', 'Region Code', 'Region Name', 'Sub-region Code', 'Sub-region Name', 'Intermediate Region Code', 'Intermediate Region Name', 'Country or Area', 'M49 Code', 'ISO-alpha2 Code', 'ISO-alpha3 Code', 'Least Developed Countries (LDC)', 'Land Locked Developing Countries (LLDC)', 'Small Island Developing States (SIDS)'], dtype='object')
Note how this dataset contains a lot of information on the various groups and codes assigned to each country. We are only interested in the name of the region each country belongs to and the ISO 3166-1 alpha-3 code assigned to the country. Using double square brackets [[ ]]
we can extract the desired columns as a new DataFrame. (In reality we are just passing a list of column names to the standard single square brackets indexer.)
regions = m49[['Region Name', 'ISO-alpha3 Code']]
regions.head()
Region Name | ISO-alpha3 Code | |
---|---|---|
0 | Africa | DZA |
1 | Africa | EGY |
2 | Africa | LBY |
3 | Africa | MAR |
4 | Africa | SDN |
Now we can use pandas.DataFrame.merge()
again to join the regions to the rest of our data. Since the names of the columns containing the country code information differ between the datasets, we must use the left_on
and right_on
arguments instead of the on
argument from before.
data = data.merge(regions,
left_on='Country Code',
right_on='ISO-alpha3 Code',
how='inner')
data.head()
Country Name | Country Code | year | population | gdp | gdp_per_capita | life_exp | Region Name | ISO-alpha3 Code | |
---|---|---|---|---|---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 | NaN | NaN | 64.152 | Americas | ABW |
1 | Aruba | ABW | 1961 | 55811.0 | NaN | NaN | 64.537 | Americas | ABW |
2 | Aruba | ABW | 1962 | 56682.0 | NaN | NaN | 64.752 | Americas | ABW |
3 | Aruba | ABW | 1963 | 57475.0 | NaN | NaN | 65.132 | Americas | ABW |
4 | Aruba | ABW | 1964 | 58178.0 | NaN | NaN | 65.294 | Americas | ABW |
Note how the new joined dataset contains both of the country code columns (because their names were different). Also, the naming convention in our table is not uniform – some column names are in snake_case
(which is preferred) while others contain spaces and a mix of uppercase and lowercase letters. Let us use pandas.DataFrame.drop()
to drop the second country code column and pandas.DataFrame.rename()
to rename some of the columns to ensure a uniform column naming convention. Remember that we can use the inplace=True
argument to apply the changes to the original DataFrame.
data.drop(columns='ISO-alpha3 Code', inplace=True)
pandas.DataFrame.rename()
takes a dictionary in the format {"old_name": "new_name"}
as an argument and you need to specify whether you would like to rename rows or columns.
data.rename(columns={'Country Name': 'country_name',
'Country Code': 'country_code',
'Region Name': 'region_name'},
inplace=True)
data.head()
country_name | country_code | year | population | gdp | gdp_per_capita | life_exp | region_name | |
---|---|---|---|---|---|---|---|---|
0 | Aruba | ABW | 1960 | 54608.0 | NaN | NaN | 64.152 | Americas |
1 | Aruba | ABW | 1961 | 55811.0 | NaN | NaN | 64.537 | Americas |
2 | Aruba | ABW | 1962 | 56682.0 | NaN | NaN | 64.752 | Americas |
3 | Aruba | ABW | 1963 | 57475.0 | NaN | NaN | 65.132 | Americas |
4 | Aruba | ABW | 1964 | 58178.0 | NaN | NaN | 65.294 | Americas |
To extract specific rows from a DataFrame, we can combine the square brackets indexing operator pandas.DataFrame[]
with a logical operation that produces a boolean array. This would select every row from the DataFrame where the corresponding element in the boolean array equals True
. For example, to extract all rows that correspond to the United States, we could use data.country_code == "USA"
. This would return an array of True
and False
values where the value of a specific element in the array is True
if the corresponding row in the data
DataFrame had the value "USA"
in its country_code
column.
data.country_code == 'USA'
0 False 1 False 2 False 3 False 4 False ... 13325 False 13326 False 13327 False 13328 False 13329 False Name: country_code, Length: 13330, dtype: bool
Combining this with the square brackets indexing operator data[]
will extract all values from the data
DataFrame where the country_code
column has the value "USA"
.
usa_data = data[data.country_code == 'USA']
usa_data.head()
country_name | country_code | year | population | gdp | gdp_per_capita | life_exp | region_name | |
---|---|---|---|---|---|---|---|---|
12524 | United States | USA | 1960 | 180671000.0 | 5.433000e+11 | 3007.123445 | 69.770732 | Americas |
12525 | United States | USA | 1961 | 183691000.0 | 5.633000e+11 | 3066.562869 | 70.270732 | Americas |
12526 | United States | USA | 1962 | 186538000.0 | 6.051000e+11 | 3243.843078 | 70.119512 | Americas |
12527 | United States | USA | 1963 | 189242000.0 | 6.386000e+11 | 3374.515171 | 69.917073 | Americas |
12528 | United States | USA | 1964 | 191889000.0 | 6.858000e+11 | 3573.941185 | 70.165854 | Americas |
We can ensure that this new usa_data
DataFrame only contains values corresponding to the United States by calling pandas.Series.unique()
on the country_name
column. This will return an array of all the unique country names present in the table.
usa_data.country_name.unique()
array(['United States'], dtype=object)
Note that even though there is only one unique value, the result is still an array. To extract the value as a string, we must extract the first element of the array using [0]
.
usa_data.country_name.unique()[0]
'United States'
Matplotlib is the primary plotting library in Python and it is designed to resemble the plotting functionality of MATLAB. While it provides all kinds of different plotting functionality, the matplotlib.pyplot
module is used the most. It is common to import this module under the alias plt
as we did before. Matplotlib works in a layered fashion. First you define your plot using matplotlib.pyplot.plot(x, y, ...)
, then you can use additional matplotlib.pyplot
methods to add more layers to your plot or modify its appearance. Finally, you use matplotlib.pyplot.show()
to display the plot or matplotlib.pyplot.savefig()
to save it to an external file.
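Schematically, the layered workflow looks like the following sketch (the data and file name are placeholders):

plt.plot([1, 2, 3, 4], [1, 4, 9, 16])  # define the plot
plt.title('Placeholder Plot')          # add layers or tweak the appearance
plt.savefig('placeholder.png')         # save the figure to an external file...
plt.show()                             # ...or display it on screen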
The x
and y
arguments in the matplotlib.pyplot.plot()
call can be either arrays or pandas.Series
objects. For example, we can visualize the population of the United States over time by extracting the year
and population
columns of the usa_data
table as pandas.Series
objects and passing them along to matplotlib.pyplot.plot()
as follows.
plt.plot(usa_data.year, usa_data.population)
plt.show()
Alternatively we could pass the pandas.DataFrame
to the matplotlib.pyplot.plot()
command using the optional data
argument. This will allow us to specify the desired column names as the x
and y
arguments instead of having to extract them as pandas.Series
objects. For example, we can visualize the GDP of the United States over time as follows.
plt.plot('year', 'gdp', data=usa_data)
plt.show()
Pandas also has built-in plotting functionality via the pandas.DataFrame.plot()
method. It takes the column names of the x
and y
columns as arguments and uses a plotting backend to generate the plot. By default, the plotting backend is Matplotlib, but this could be reconfigured to be something else instead. For example, we can create a Matplotlib visualization showing United States life expectancy over time as follows.
usa_data.plot(x='year', y='life_exp')
plt.show()
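As an aside, switching the plotting backend is a single assignment. For example, with the hvplot.pandas import from before, the following sketch should make pandas.DataFrame.plot() produce interactive plots instead (this is an illustration only; we will keep using the Matplotlib default below):

pd.options.plotting.backend = 'hvplot'      # switch to the HVPlot backend
usa_data.plot(x='year', y='life_exp')       # now returns an interactive plot
pd.options.plotting.backend = 'matplotlib'  # switch back to the default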
To create a line graph with multiple lines, we need to stack the lines using multiple matplotlib.pyplot.plot()
or pandas.DataFrame.plot()
calls. But how can we specify that we would like to stack the lines onto a single plot instead of creating a new plot for each line? This is where the matplotlib.axes.Axes
class comes into play. For simplicity, you can think of each matplotlib.axes.Axes
object as a canvas onto which one can add multiple layers of visualization. When using pandas.DataFrame.plot()
to create visualizations, we can utilize matplotlib.axes.Axes
to create multi-layered plots as follows:
- The first pandas.DataFrame.plot() command will return a matplotlib.axes.Axes object. This object should be saved into a variable. It is common to save it into a variable called ax.
- In each subsequent pandas.DataFrame.plot() call, the matplotlib.axes.Axes object from before should be passed on using the ax argument. This will ensure the new plot gets added to the same canvas.

We can combine the pandas.DataFrame.plot()
call with boolean indexing to easily visualize subsets of the data and use the color
and label
arguments to specify a color and legend label for each subset.
Once all the lines have been added to the plot, we can use matplotlib.pyplot.ylabel()
and matplotlib.pyplot.xlabel()
to label the axes and matplotlib.pyplot.title()
to specify a title for the visualization. Finally we call matplotlib.pyplot.show()
to display the plot.
Knowing all this, a visualization illustrating the GDP per capita over time for North American countries can be generated as follows.
ax = data[data.country_code == 'USA'].plot(x='year',
y='gdp_per_capita',
color='blue',
label='USA')
data[data.country_code == 'CAN'].plot(x='year',
y='gdp_per_capita',
color='red',
label='Canada',
ax=ax)
data[data.country_code == 'MEX'].plot(x='year',
y='gdp_per_capita',
color='green',
label='Mexico',
ax=ax)
plt.ylabel('GDP per capita')
plt.xlabel('Year')
plt.title('GDP per Capita Over Time for North American Countries')
plt.show()
There are many benefits to using pandas.DataFrame.plot()
over matplotlib.pyplot.plot()
when dealing with DataFrames. Most importantly, pandas.DataFrame.plot()
interacts directly with a Pandas DataFrame and has a much simpler user interface with numerous named arguments allowing for easy customization. When it comes to more advanced tasks, however, Matplotlib allows for better fine-tuning and more flexibility, though this comes at the cost of more complex commands. For example, to create a plot that displays the temporal variation of both the life expectancy and GDP per capita of the United States using two different Y axes, we must use a relatively advanced workflow.
First, we define the size of our plot using the figsize
argument of matplotlib.pyplot.subplots()
. This command allows for the creation of multiple subplots, but is also frequently used to specify the size of a single plot. It returns a tuple consisting of a matplotlib.figure.Figure
and a matplotlib.axes.Axes
object.
To add a plot layer to a specific matplotlib.axes.Axes
object, we can use matplotlib.axes.Axes.plot()
which works very similarly to the previously discussed matplotlib.pyplot.plot()
command. Both commands take an optional format string as the third positional argument that allows you to specify the line and marker style and color using a simple shorthand. For example, "g--"
means a green dashed line and "mx"
indicates magenta-colored X-shaped markers. Refer to the function documentation for a full overview of all the shorthand characters. The Matplotlib commands for adding axes labels and plot titles also have additional arguments that modify the appearance of the label or title. For example, color
usually specifies the text color and size
is used to specify the size of the font.
To add another Y axis to the plot, we can use matplotlib.axes.Axes.twinx()
to create another matplotlib.axes.Axes
object that defines a new Y axis but shares the same X axis.
Finally, we can use matplotlib.figure.Figure.legend()
to add a legend to the whole figure (including all the axes objects).
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(usa_data.year, usa_data.gdp_per_capita, 'g--', label='GDP per Capita')
plt.ylabel('GDP per Capita', color='g')
plt.xlabel('Year')
ax2 = ax.twinx()
ax2.plot(usa_data.year, usa_data.life_exp, 'mx', label='Life Expectancy')
plt.ylabel('Life Expectancy', color='m')
plt.title('United States', size=20)
fig.legend()
plt.show()
Let us return to our original goal of exploring the relationship between health and wealth. We will use GDP per capita as a proxy for wealth and life expectancy as an indicator of health. We can simplify the analysis by looking only at one point in time and focus our analysis on 2020, which is the latest year we have both GDP per capita and life expectancy data available. We shall use boolean indexing to extract 2020 data into a new DataFrame called data2020
.
data2020 = data[data.year == 2020]
data2020.head()
country_name | country_code | year | population | gdp | gdp_per_capita | life_exp | region_name | |
---|---|---|---|---|---|---|---|---|
60 | Aruba | ABW | 2020 | 106585.0 | 2.610039e+09 | 24487.863560 | 75.723 | Americas |
122 | Afghanistan | AFG | 2020 | 38972230.0 | 2.014344e+10 | 516.866552 | 62.575 | Asia |
184 | Angola | AGO | 2020 | 33428486.0 | 5.361907e+10 | 1603.993477 | 62.261 | Africa |
246 | Albania | ALB | 2020 | 2837849.0 | 1.513187e+10 | 5332.160475 | 76.989 | Europe |
308 | Andorra | AND | 2020 | 77700.0 | 2.891022e+09 | 37207.493861 | NaN | Europe |
How is wealth distributed amongst the global population? Let us get a vague idea by visualizing the distribution of GDP per capita amongst world countries in 2020. We can easily create a histogram by using the matplotlib.pyplot.hist()
command and passing it the GDP per capita pandas.Series
.
plt.hist(data2020.gdp_per_capita)
plt.xlabel('GDP per Capita')
plt.show()
Note how we were able to easily create a histogram, but the result was quite ugly. If we wanted a prettier plot, we could go through the trouble of customizing it using various additional arguments and commands, which would take quite a while. Or we could use Seaborn, which allows us to easily create beautiful visualizations with sensible defaults. For example, we could create a well-designed histogram with a smoothed kernel density estimate (KDE) overlay using the seaborn.histplot()
function along with the kde=True
flag. Knowing this, let us look at the distribution of life expectancy amongst world countries in 2020.
sns.histplot(data2020.life_exp, kde=True)
plt.show()
To easily create a scatter plot analyzing the relationship between GDP per capita and life expectancy, we can use pandas.DataFrame.plot()
and specify kind="scatter"
to ensure the result is a scatter plot.
data2020.plot(x='gdp_per_capita', y='life_exp', kind='scatter')
plt.show()
The relationship appears to be logarithmic. This is likely due to the distribution of GDP per capita being heavily skewed. We can easily confirm this by plotting a two-dimensional kernel density estimate (KDE) plot using seaborn.jointplot()
along with kind="kde"
. (To get a scatter plot with histograms, one would use kind="scatter"
.)
sns.jointplot(data=data2020,
x='gdp_per_capita',
y='life_exp',
kind='kde',
fill=True)
plt.show()
To get a better sense of the potentially logarithmic relationship between GDP per capita and life expectancy, we should apply a logarithmic transformation to the axis corresponding to GDP per capita. In our example this is the X axis and we can apply a logarithmic transformation on the X axis by passing "log"
to the matplotlib.pyplot.xscale()
function.
plt.scatter(data2020.gdp_per_capita, data2020.life_exp)
plt.xscale('log')
plt.xlabel('GDP per Capita')
plt.ylabel('Life Expectancy')
plt.show()
Note how we used matplotlib.pyplot.scatter()
instead of pandas.DataFrame.plot()
to create the scatter plot. Both functions are very similar and in reality, the latter simply calls the former. Also note how now the X axis of the scatter plot is logarithmic. This makes the relationship much clearer and we can quite definitely state that there appears to be a logarithmic relationship between life expectancy and GDP per capita.
But does the size of a country play a role in this relationship? To find out, we can scale the size of the data points proportionally to the population such that bigger points indicate countries with more population. This can be done using the s
argument in matplotlib.pyplot.scatter()
, which takes an array of point sizes. This array needs to be the same size as the x
and y
arrays with one size value for each x
and y
combination. We can easily generate an array like this using the formula $X \div \max(X) \times s$, where $X$ is the array we want to base the sizes on and $s$ is a scaling factor in arbitrary plot units. Note that the scaling factor is completely arbitrary and you might need to try different values until you find something that makes the visualization look good. We divide the input array by its maximum value to properly normalize and scale the sizes.
Scaling the point sizes by population might cause some bigger points to overlap smaller ones. To ensure we can properly see overlapping points, we can use the alpha
argument in the matplotlib.pyplot.scatter()
call to specify a transparency factor.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(data2020.gdp_per_capita,
data2020.life_exp,
s=data2020.population/data2020.population.max()*5000,
alpha=0.5)
plt.xscale('log')
plt.xlabel('GDP per Capita')
plt.ylabel('Life Expectancy')
plt.show()
The size of a country does not seem to be related to GDP per capita or life expectancy. But what about the region a country is in? There is a good chance a correlation exists between the geographical location of a country and other indicators. To find out, we should color the points based on their geographic region. We know from before that this requires adding multiple layers to the plot – one for each region. We can get a list of all the regions by using pandas.Series.unique()
on the region_name
column. Then we can iterate over that list using a loop, subset the data for each region, and create a scatter plot layer using the subset data.
fig, ax = plt.subplots(figsize=(7, 7))
for region in data2020.region_name.unique():
subset = data2020[data2020.region_name == region]
ax.scatter(subset.gdp_per_capita,
subset.life_exp,
s=subset.population/data2020.population.max()*5000,
label=region,
alpha=0.5)
plt.xscale('log')
plt.xlabel('GDP per Capita')
plt.ylabel('Life Expectancy')
plt.title('2020')
plt.show()
One of the main drawbacks of Matplotlib is that one needs to create multiple layers to visualize groups using different colors. This can be a tedious process and usually involves subsetting the data using a loop. To get around this, many choose to use Seaborn instead, which allows a grouping variable to be passed via the hue
argument. For example, to recreate the plot from above without having to use a loop, we can utilize seaborn.scatterplot()
with hue="region_name"
. To scale the point sizes by population, we can specify size="population" and then use the sizes argument to pass a tuple defining the smallest and largest point sizes in arbitrary plot units. As before, you might need to play around with the tuple values in sizes
until you find a combination that looks good.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=data2020,
x='gdp_per_capita',
y='life_exp',
hue='region_name',
size='population',
sizes=(10, 5000),
alpha=0.5,
legend=False,
ax=ax)
plt.xscale('log')
plt.xlabel('GDP per Capita')
plt.ylabel('Life Expectancy')
plt.title('2020')
plt.show()
While the static scatter plot above is quite pretty to look at, it is not the most informative. We have no idea which points represent which countries and many countries appear clustered together, which makes it harder to tell them apart. An interactive visualization would allow for better exploration and investigation of the data. The easiest way of creating an interactive visualization out of a Pandas DataFrame is to use HVPlot, which is built on top of Bokeh and HoloViews and utilizes them in the background. Importing the hvplot.pandas
module as we did before adds a new pandas.DataFrame.hvplot
interface that allows for the creation of interactive plots using a syntax very similar to that of pandas.DataFrame.plot()
.
We can easily create an interactive version of the scatter plot from before by using pandas.DataFrame.hvplot.scatter()
with the following arguments:
- x and y – the column names for the data plotted on the X and Y axes respectively
- c – the column name that defines the groups or values based on which to color the points
- s – the column name that defines values to use as point sizes
- scale – scaling factor to use when deriving point sizes from the values specified by s (we will use $1 \div \max(X) \times y$, where $X$ is the column specified in s and $y$ is an arbitrary scaling factor)
- hover_cols – fields to include in the tooltips in addition to those specified in x, y, c, and s
- alpha – transparency factor
- logx – whether to apply a logarithmic transformation on the X axis
- width and height – the size of the visualization in pixels

Take some time to explore the interactive visualization using the available controls. Experiment with panning and zooming and hover over various points to explore the tooltips.
data2020.hvplot.scatter(x='gdp_per_capita',
y='life_exp',
c='region_name',
s='population',
scale=1/data2020.population.max()*2000000,
hover_cols=['country_name', 'country_code'],
alpha=0.5,
logx=True,
width=650,
height=500)
An alternative to HVPlot is Plotly, which is a popular interactive visualization library used in many programming languages. It consists of a complex ecosystem of various modules, but the plotly.express
module is the most popular and easiest to use. The syntax of plotly.express
is very similar to that of HVPlot. The biggest difference between the two libraries is that plotly.express
does not handle missing data and expects the input DataFrame to not contain any missing values. Hence we must drop all rows with missing values from the table using pandas.DataFrame.dropna()
before passing it onto Plotly.
We can create an interactive scatter plot via Plotly using the plotly.express.scatter()
function along with the following arguments (note the similarities with HVPlot):

- data_frame – the pandas.DataFrame to use for the visualization, with rows containing missing values removed
- x and y – the column names for the data plotted on the X and Y axes respectively
- color – the column name that defines the groups or values based on which to color the points
- size – the column name that defines values to use as point sizes
- size_max – the size of the largest point in pixels (used to scale all point sizes)
- hover_name – the column name that defines the values to be used as tooltip titles
- hover_data – fields to include in the tooltip in addition to those specified in x, y, color, size, and hover_name
- opacity – transparency factor
- log_x – whether to apply a logarithmic transformation on the X axis
- width and height – the size of the visualization in pixels

As before, make sure to explore the interactive visualization using the available controls. Note how the tooltips and controls differ from those provided by HVPlot.
px.scatter(data_frame=data2020.dropna(),
x='gdp_per_capita',
y='life_exp',
color='region_name',
size='population',
size_max=40,
hover_name='country_name',
hover_data=['country_code'],
opacity=0.5,
log_x=True,
width=650,
height=600)
Thus far we have covered the basics of working with data in Python, including reading CSV files, manipulating and reshaping data, joining tables, and creating both static and interactive visualizations. This covers the majority of the most essential data analysis workflows you might need. However, there are two major topics we have yet to discuss -- working with time series and aggregating data by group. We will explore these concepts using rapid transit ridership data from the Massachusetts Bay Transportation Authority (MBTA). Once we have covered these two final topics, you should have all the skills you need to begin your Python data analysis journey.
The dataset we will use is a CSV file named mbta-gated-entries-2020.csv
located in the data
directory. Each row in the table represents a unique 30-minute service time period for a specific MBTA rapid transit station and line combination in 2020. The columns are as follows:

- service_date -- date in ISO 8601 YYYY-MM-DD format
- time_period -- timestamp denoting the start of the 30-minute time period in a somewhat unusual (HH:mm:ss) format
- stop_id -- unique identifier for the rapid transit stop
- station_name -- name of the rapid transit stop
- route_or_line -- route or line served by the stop
- gated_entries -- number of gated entries at the specified stop for the specified line or route in the specified time period

Let us read this dataset into a pandas.DataFrame called mbta using pandas.read_csv() and explore it via pandas.DataFrame.head() and pandas.DataFrame.dtypes.
mbta = pd.read_csv('data/mbta-gated-entries-2020.csv')
mbta.head()
service_date | time_period | stop_id | station_name | route_or_line | gated_entries | |
---|---|---|---|---|---|---|
0 | 2020-01-01 | (00:00:00) | place-alfcl | Alewife | Red Line | 3 |
1 | 2020-01-01 | (00:00:00) | place-andrw | Andrew | Red Line | 8 |
2 | 2020-01-01 | (00:00:00) | place-aport | Airport | Blue Line | 32 |
3 | 2020-01-01 | (00:00:00) | place-aqucl | Aquarium | Blue Line | 15 |
4 | 2020-01-01 | (00:00:00) | place-armnl | Arlington | Green Line | 3 |
mbta.dtypes
service_date object time_period object stop_id object station_name object route_or_line object gated_entries int64 dtype: object
Note how both the service_date
and time_period
have the datatype of object
, indicating that they are stored as text. This does not allow us to treat these values as proper timestamps, limiting our options for quantitative analysis. To fix this, we should combine the service_date
and time_period
into a single timestamp using pandas.to_datetime()
. But first we must clean the time_period
values, which are all in parentheses for some reason.
To strip the time_period
values of the parentheses, we can utilize the pandas.Series.str
interface that allows us to apply string methods on the whole column. This allows us to apply a vectorized version of the built-in str.strip()
method to the whole column via pandas.Series.str.strip()
.
Hence we can do the following all in one command:

- Extract the time_period column as a pandas.Series object.
- Apply pandas.Series.str.strip() to remove the parentheses from the values, creating a new pandas.Series object.
- Overwrite the original time_period column with this new pandas.Series object where the parentheses have been removed.

mbta['time_period'] = mbta.time_period.str.strip('()')
mbta.head()
service_date | time_period | stop_id | station_name | route_or_line | gated_entries | |
---|---|---|---|---|---|---|
0 | 2020-01-01 | 00:00:00 | place-alfcl | Alewife | Red Line | 3 |
1 | 2020-01-01 | 00:00:00 | place-andrw | Andrew | Red Line | 8 |
2 | 2020-01-01 | 00:00:00 | place-aport | Airport | Blue Line | 32 |
3 | 2020-01-01 | 00:00:00 | place-aqucl | Aquarium | Blue Line | 15 |
4 | 2020-01-01 | 00:00:00 | place-armnl | Arlington | Green Line | 3 |
We can concatenate pandas.Series
objects containing textual data the same way we can concatenate strings in Python. Knowing this, we can easily combine the service_date
and time_period
columns into a single timestamp.
mbta['timestamp'] = mbta.service_date + ' ' + mbta.time_period
mbta.head()
service_date | time_period | stop_id | station_name | route_or_line | gated_entries | timestamp | |
---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 00:00:00 | place-alfcl | Alewife | Red Line | 3 | 2020-01-01 00:00:00 |
1 | 2020-01-01 | 00:00:00 | place-andrw | Andrew | Red Line | 8 | 2020-01-01 00:00:00 |
2 | 2020-01-01 | 00:00:00 | place-aport | Airport | Blue Line | 32 | 2020-01-01 00:00:00 |
3 | 2020-01-01 | 00:00:00 | place-aqucl | Aquarium | Blue Line | 15 | 2020-01-01 00:00:00 |
4 | 2020-01-01 | 00:00:00 | place-armnl | Arlington | Green Line | 3 | 2020-01-01 00:00:00 |
mbta.dtypes
service_date object time_period object stop_id object station_name object route_or_line object gated_entries int64 timestamp object dtype: object
Although the text in the new timestamp
column sure looks like a valid timestamp, it is still just textual data and has no meaning to Python or Pandas. To convert these textual timestamps into Pandas-aware timestamps, we can use pandas.to_datetime()
and pass the timestamp
series as input. This will produce a new series that we will use to replace the timestamp
column.
mbta['timestamp'] = pd.to_datetime(mbta.timestamp)
mbta.head()
service_date | time_period | stop_id | station_name | route_or_line | gated_entries | timestamp | |
---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 00:00:00 | place-alfcl | Alewife | Red Line | 3 | 2020-01-01 |
1 | 2020-01-01 | 00:00:00 | place-andrw | Andrew | Red Line | 8 | 2020-01-01 |
2 | 2020-01-01 | 00:00:00 | place-aport | Airport | Blue Line | 32 | 2020-01-01 |
3 | 2020-01-01 | 00:00:00 | place-aqucl | Aquarium | Blue Line | 15 | 2020-01-01 |
4 | 2020-01-01 | 00:00:00 | place-armnl | Arlington | Green Line | 3 | 2020-01-01 |
mbta.dtypes
service_date object time_period object stop_id object station_name object route_or_line object gated_entries int64 timestamp datetime64[ns] dtype: object
Note how timestamp
now has a datatype of datetime64
(with nanosecond precision). This allows us to perform arithmetic and comparisons on the timestamps and also utilize various additional date-time methods (like extracting the month or weekday for example) via the pandas.Series.dt
interface.
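For example, the following sketch shows a few of the pandas.Series.dt accessors in action:

mbta.timestamp.dt.month       # month number (1 through 12)
mbta.timestamp.dt.day_name()  # weekday name, e.g. 'Wednesday'
mbta.timestamp.dt.hour        # hour at which the 30-minute period starts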
Let us simplify our further analysis by dropping the redundant service_date
and time_period
columns using the pandas.DataFrame.drop()
method.
mbta.drop(columns=['service_date', 'time_period'], inplace=True)
mbta.head()
stop_id | station_name | route_or_line | gated_entries | timestamp | |
---|---|---|---|---|---|
0 | place-alfcl | Alewife | Red Line | 3 | 2020-01-01 |
1 | place-andrw | Andrew | Red Line | 8 | 2020-01-01 |
2 | place-aport | Airport | Blue Line | 32 | 2020-01-01 |
3 | place-aqucl | Aquarium | Blue Line | 15 | 2020-01-01 |
4 | place-armnl | Arlington | Green Line | 3 | 2020-01-01 |
We can easily get the total number of gated entries across the whole MBTA system in 2020 by using pandas.Series.sum().
mbta.gated_entries.sum()
50199157
Combining pandas.Series.sum()
with boolean indexing allows us to extract the total number of gated entries for specific stations, lines, or even dates.
mbta.gated_entries[mbta.station_name == 'Davis'].sum()
996012
mbta.gated_entries[mbta.route_or_line == 'Red Line'].sum()
18947501
mbta.gated_entries[mbta.timestamp == '2020-02-24'].sum()
979
Note that because the timestamps carry a time component, comparing against a bare date like '2020-02-24' matches only the midnight (00:00:00) interval, so this sum covers just the first 30-minute period of that day rather than the full date.
Having the timestamps in datetime64
format allows us to extract specific time periods using comparisons. For example, we can get the total number of gated entries across the whole MBTA system in February 2020 as follows.
mbta.gated_entries[
(mbta.timestamp >= '2020-02-01') & (mbta.timestamp < '2020-03-01')].sum()
10776306
Alternatively, we could take advantage of pandas.Series.dt.month
to extract the month numbers of the datetype64
values and use that to get the same information.
mbta.gated_entries[mbta.timestamp.dt.month == 2].sum()
10776306
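These filters can also be combined. As an illustrative sketch (output omitted), the February total for just the Davis station could be computed by joining two boolean masks with &:
mbta.gated_entries[(mbta.station_name == 'Davis')
                   & (mbta.timestamp.dt.month == 2)].sum()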
Let us say we would like to get the number of gated entries across the whole MBTA system for each day in 2020. Pandas provides easy functionality to calculate various aggregate values by group, as long as there is a categorical column that defines the groups. Currently we only have a datetime column, which is not categorical and hence not suitable for aggregating entries by date. However, the service_date
column we removed would have been perfect for this task. Luckily we can easily recreate this column using pandas.Series.dt.date
to extract the date from the datetime64
timestamp.
mbta['date'] = mbta.timestamp.dt.date
mbta.head()
stop_id | station_name | route_or_line | gated_entries | timestamp | date | |
---|---|---|---|---|---|---|
0 | place-alfcl | Alewife | Red Line | 3 | 2020-01-01 | 2020-01-01 |
1 | place-andrw | Andrew | Red Line | 8 | 2020-01-01 | 2020-01-01 |
2 | place-aport | Airport | Blue Line | 32 | 2020-01-01 | 2020-01-01 |
3 | place-aqucl | Aquarium | Blue Line | 15 | 2020-01-01 | 2020-01-01 |
4 | place-armnl | Arlington | Green Line | 3 | 2020-01-01 | 2020-01-01 |
Now we can use pandas.DataFrame.groupby()
to convert the pandas.DataFrame
into a pandas.groupby.DataFrameGroupBy
object, where all the values of the DataFrame are grouped by the specified categorical variable and any methods called on it will apply by group. Note that the result is no longer a DataFrame, so we cannot display it as such.
mbta.groupby('date')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f586afe0750>
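Although the GroupBy object has no tabular display, it can still be inspected. For example, its ngroups attribute reports the number of groups, here one per date (366, since 2020 was a leap year):
mbta.groupby('date').ngroups  # number of distinct dates in the data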
We can extract the desired column from this pandas.groupby.DataFrameGroupBy
object as a pandas.groupby.SeriesGroupBy
object, where any methods called on the series will apply by the previously defined groups.
mbta.groupby('date').gated_entries
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f586afe0250>
When we call pandas.groupby.GroupBy.sum()
on this pandas.groupby.SeriesGroupBy
object, we will get a new pandas.Series
object where all the gated entries for each unique date have been added together.
mbta.groupby('date').gated_entries.sum()
date
2020-01-01    131374
2020-01-02    398109
2020-01-03    402018
2020-01-04    196810
2020-01-05    152047
               ...
2020-12-27     50923
2020-12-28     94764
2020-12-29     96679
2020-12-30     96727
2020-12-31     54861
Name: gated_entries, Length: 366, dtype: int64
We can convert this pandas.Series
into a pandas.DataFrame
using pandas.Series.to_frame()
. We can also specify a new name for the column containing the aggregated values if desired.
mbta.groupby('date').gated_entries.sum().to_frame('total_entries').head()
total_entries | |
---|---|
date | |
2020-01-01 | 131374 |
2020-01-02 | 398109 |
2020-01-03 | 402018 |
2020-01-04 | 196810 |
2020-01-05 | 152047 |
Note how the groups make up the index of the new DataFrame. We can use pandas.DataFrame.reset_index()
to convert the dates back into a column and reset the index to a numerical one ranging from zero to one less than the number of rows. We can chain all the methods from before together and create a new DataFrame called mbta_daily_sum
that contains the total number of gated entries across the MBTA system for each date in 2020.
mbta_daily_sum = (mbta.groupby('date')
.gated_entries.sum()
.to_frame('total_entries')
.reset_index())
mbta_daily_sum.head()
date | total_entries | |
---|---|---|
0 | 2020-01-01 | 131374 |
1 | 2020-01-02 | 398109 |
2 | 2020-01-03 | 402018 |
3 | 2020-01-04 | 196810 |
4 | 2020-01-05 | 152047 |
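As a side note, passing as_index=False to groupby() keeps the grouping column as a regular column, letting us skip the reset_index() step. The aggregated column then retains its original name, so a rename would be needed to match the total_entries naming above:
mbta.groupby('date', as_index=False)['gated_entries'].sum()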
If we wanted to find out which date had the most ridership, we could use pandas.Series.max()
to get the maximum number of gated entries and then utilize boolean indexing to find out which date it corresponds to.
mbta_daily_sum.total_entries.max()
495770
mbta_daily_sum.date[
mbta_daily_sum.total_entries == mbta_daily_sum.total_entries.max()]
42    2020-02-12
Name: date, dtype: object
mbta_daily_sum.date[
mbta_daily_sum.total_entries == mbta_daily_sum.total_entries.max()
].values[0]
datetime.date(2020, 2, 12)
Alternatively, we could use pandas.Series.argmax()
to extract the index of the row with the most ridership and then utilize pandas.DataFrame.loc[]
to extract said row using its index.
mbta_daily_sum.total_entries.argmax()
42
mbta_daily_sum.loc[mbta_daily_sum.total_entries.argmax()]
date             2020-02-12
total_entries        495770
Name: 42, dtype: object
mbta_daily_sum.loc[mbta_daily_sum.total_entries.argmax(), 'date']
datetime.date(2020, 2, 12)
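Since mbta_daily_sum has a default integer index, pandas.Series.idxmax(), which returns the index label (rather than the position) of the maximum, would work just as well here and returns the same datetime.date(2020, 2, 12):
mbta_daily_sum.loc[mbta_daily_sum.total_entries.idxmax(), 'date']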
Finally, we could utilize pandas.DataFrame.sort_values()
to sort the values by total number of gated entries.
mbta_daily_sum.sort_values('total_entries')
date | total_entries | |
---|---|---|
102 | 2020-04-12 | 17976 |
116 | 2020-04-26 | 18506 |
109 | 2020-04-19 | 20025 |
95 | 2020-04-05 | 20240 |
359 | 2020-12-25 | 20680 |
... | ... | ... |
29 | 2020-01-30 | 490601 |
14 | 2020-01-15 | 491462 |
35 | 2020-02-05 | 492955 |
57 | 2020-02-27 | 493547 |
42 | 2020-02-12 | 495770 |
366 rows × 2 columns
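As a shortcut, pandas.DataFrame.nlargest() offers a way to grab just the top rows by a given column without sorting the whole frame; for example, the five busiest days (output omitted):
mbta_daily_sum.nlargest(5, 'total_entries')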
Knowing all of this, we can easily take a quick look at the most and least used rapid transit stations and lines across the MBTA system in 2020 by chaining groupby(), sum(), sort_values(), to_frame(), and reset_index():
(mbta.groupby('station_name')
.gated_entries.sum()
.sort_values(ascending=False)
.to_frame('total_entries')
.reset_index())
station_name | total_entries | |
---|---|---|
0 | Downtown Crossing | 2289292 |
1 | South Station | 2017283 |
2 | North Station | 1720378 |
3 | Harvard | 1690353 |
4 | Maverick | 1661287 |
... | ... | ... |
59 | Symphony | 182537 |
60 | Riverside | 137870 |
61 | Suffolk Downs | 112710 |
62 | World Trade Center | 91707 |
63 | Science Park | 38237 |
64 rows × 2 columns
(mbta.groupby('route_or_line')
.gated_entries.sum()
.sort_values(ascending=False)
.to_frame('total_entries')
.reset_index())
route_or_line | total_entries | |
---|---|---|
0 | Red Line | 18947501 |
1 | Orange Line | 16121128 |
2 | Green Line | 7493791 |
3 | Blue Line | 6626063 |
4 | Silver Line | 1010674 |
We can also group by multiple columns. For example, we can group by date
, station_name
, and route_or_line
to create a new DataFrame mbta_daily
, where the gated entries for each station and line combination are aggregated into 24-hour intervals instead of the original 30-minute intervals.
mbta_daily = (mbta.groupby(['date', 'station_name', 'route_or_line'])
.gated_entries.sum()
.to_frame()
.reset_index())
mbta_daily.head()
date | station_name | route_or_line | gated_entries | |
---|---|---|---|---|
0 | 2020-01-01 | Airport | Blue Line | 3883 |
1 | 2020-01-01 | Alewife | Red Line | 2449 |
2 | 2020-01-01 | Andrew | Red Line | 1668 |
3 | 2020-01-01 | Aquarium | Blue Line | 2015 |
4 | 2020-01-01 | Arlington | Green Line | 1592 |
This new DataFrame allows us to perform any further analysis for which a daily temporal resolution is sufficient. For example, we could plot the daily number of gated entries at the Harvard Square MBTA station throughout 2020 and see whether the onset of the COVID-19 pandemic had an effect on ridership.
fig, ax = plt.subplots(figsize=(7, 5))
mbta_daily[(mbta_daily.station_name == 'Harvard')].plot(x='date',
y='gated_entries',
legend=False,
color='crimson',
ax=ax)
plt.xlabel('Date')
plt.ylabel('Gated Entries')
plt.title('2020 Daily Gated Entries at the Harvard Square MBTA Station')
plt.show()
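Since the hvplot.pandas module was imported at the start of this tutorial, an interactive version of the same plot can be sketched with minimal changes (an illustrative aside; the keyword arguments mirror the Matplotlib labels above):
mbta_daily[mbta_daily.station_name == 'Harvard'].hvplot.line(
    x='date',
    y='gated_entries',
    xlabel='Date',
    ylabel='Gated Entries',
    title='2020 Daily Gated Entries at the Harvard Square MBTA Station')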
Interactive Kaggle tutorials with built-in exercises:
Pandas -- https://www.kaggle.com/learn/pandas
Data Visualization -- https://www.kaggle.com/learn/data-visualization
Official Pandas resources:
Documentation -- https://pandas.pydata.org/docs/
Official Matplotlib resources:
Documentation -- https://matplotlib.org/stable/
Official Seaborn resources:
Documentation -- https://seaborn.pydata.org/
Official HVPlot resources:
Documentation -- https://hvplot.holoviz.org/
Official Plotly resources:
Documentation -- https://plotly.com/python/