import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline plt.style.use('ggplot') import dateutil.parser # Note to self for later: http://stackoverflow.com/a/17001474 # Data from https://www.census.gov/econ/currentdata/datasets/ data_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data') data_df.head() categories_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='categories') data_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data_types') error_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='error_types') geo_levels_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='geo_levels') periods_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='periods') categories_df.head(2) # it auto-merges cat_idx in our original dataframe with cat_idx in categories_df # it auto-merges dt_idx in our original dataframe with dt_idx in data_types_df # it auto-merges geo_idx in our original dataframe with geo_idx in geo_levels_df # it auto-merges per_idx in our original dataframe with per_idx in periods_df df = data_df.merge(categories_df).merge(data_types_df).merge(geo_levels_df).merge(periods_df) # We only want to look at the total number of homes sold across entire the united states df = df[(df['cat_code'] == 'SOLD') & (df['geo_code'] == 'US') & (df['dt_code'] == 'TOTAL')] # We don't merge error_types_df because all of the errors are the same df['et_idx'].value_counts() df.head(2) # Now let's remove the join columns to keep things clean df = df.drop(['per_idx', 'cat_idx', 'dt_idx', 'et_idx', 'geo_idx'], axis=1) df.head() # At least we can see 'per_name' (period name) is already a datetime! df.info()