import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser

Processing time with `pandas`

We’ve touched a little bit on time so far - mostly how tragic it is to parse - but pandas can do some neat things with it once you figure out how it works.

Let’s open up some data from the Census bureau - we’re going to use New Home Sales. The data is formatted… oddly, so I’ve done the importing and joining for you below.

data_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data')
data_df.head()

	per_idx	cat_idx	dt_idx	geo_idx	val
0	1	1	5	1	17200.0
1	1	1	2	1	9.0
2	1	1	3	1	11.0
3	1	1	4	1	22.0
4	1	1	1	1	42.0

categories_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='categories')
data_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='data_types')
error_types_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='error_types')
geo_levels_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='geo_levels')
periods_df = pd.read_excel("RESSALES-mf.xlsx", sheetname='periods')

categories_df.head(2)

	cat_idx	cat_code	cat_desc	cat_indent
0	1	SOLD	New Single-family Houses Sold	0
1	2	ASOLD	Annual Rate for New Single-family Houses Sold	0

# it auto-merges cat_idx in our original dataframe with cat_idx in categories_df
# it auto-merges dt_idx in our original dataframe with dt_idx in data_types_df
# it auto-merges geo_idx in our original dataframe with geo_idx in geo_levels_df
# it auto-merges per_idx in our original dataframe with per_idx in periods_df
df = data_df.merge(categories_df).merge(data_types_df).merge(geo_levels_df).merge(periods_df)
# We only want to look at the total number of homes sold across entire the united states
df = df[(df['cat_code'] == 'SOLD') & (df['geo_code'] == 'US') & (df['dt_code'] == 'TOTAL')]

# We don't merge error_types_df because all of the errors are the same
df['et_idx'].value_counts()

0    641
Name: et_idx, dtype: int64

df.head(2)

	per_idx	cat_idx	dt_idx	et_idx	geo_idx	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc	per_name
4	1	1	1	0	1	0	42.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States	1963-01-01
14	2	1	1	0	1	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States	1963-02-01

# Now let's remove the join columns to keep things clean
df = df.drop(['per_idx', 'cat_idx', 'dt_idx', 'et_idx', 'geo_idx'], axis=1)
df.head()

	val	cat_code	cat_desc	dt_code	dt_desc	dt_unit	geo_code	geo_desc	per_name
4	42.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-01-01
14	35.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-02-01
24	44.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-03-01
34	52.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-04-01
44	58.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-05-01

# At least we can see 'per_name' (period name) is already a datetime!
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 4 to 15199
Data columns (total 11 columns):
is_adj        641 non-null int64
val           641 non-null float64
cat_code      641 non-null object
cat_desc      641 non-null object
cat_indent    641 non-null int64
dt_code       641 non-null object
dt_desc       641 non-null object
dt_unit       641 non-null object
geo_code      641 non-null object
geo_desc      641 non-null object
per_name      641 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 60.1+ KB

Changing the index to the datetime

Normally the index of the column is just a number.

df.head(3)

	val	cat_code	cat_desc	dt_code	dt_desc	dt_unit	geo_code	geo_desc	per_name
4	42.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-01-01
14	35.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-02-01
24	44.0	SOLD	New Single-family Houses Sold	TOTAL	All Houses	K	US	United States	1963-03-01

It’s the column on the far left - 0, 1, 2, 3, 4… boring and useless! If we replace the index with the datetime, though, we can start to have some fun

# First we move it over into the index column
df.index = df['per_name']
df.head(2)

	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc	per_name
per_name
1963-01-01	0	42.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States	1963-01-01
1963-02-01	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States	1963-02-01

# Then we delete the per_name column because we don't need it any more...
del df['per_name']
df.head(2)

	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc
per_name
1963-01-01	0	42.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1963-02-01	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States

Selecting specific(-ish) dates via the index

Now that our index is a datetime, we can select date ranges much more easily.

# Everything in March, 1963
df['1963-3']

	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc
per_name
1963-03-01	0	44.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States

# Everything in 2010
df['2010']

	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc
per_name
2010-01-01	0	24.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-02-01	0	27.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-03-01	0	36.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-04-01	0	41.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-05-01	0	26.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-06-01	0	28.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-07-01	0	26.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-08-01	0	23.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-09-01	0	25.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-10-01	0	23.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-11-01	0	20.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2010-12-01	0	23.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States

List slices with datetimes

We can also use list slicing with datetimes!

Just for review, you can use : to only select certain parts of a list:

# Make our list of fruits
ranked_fruits = ('banana', 'orange', 'apple', 'blueberries', 'strawberries')

# Start from the beginning, get the first two
ranked_fruits[:2]

('banana', 'orange')

# Start from two, get up until the fourth element
ranked_fruits[2:4]

('apple', 'blueberries')

# Starting from the third element, get all the rest
ranked_fruits[3:]

('blueberries', 'strawberries')

Instead of using boring ol’ numbers, we can use dates instead.

# Everything after 2001
df["2001":]

	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc
per_name
2001-01-01	0	72.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-02-01	0	85.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-03-01	0	94.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-04-01	0	84.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-05-01	0	80.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-06-01	0	79.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-07-01	0	76.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-08-01	0	74.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-09-01	0	66.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-10-01	0	66.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-11-01	0	67.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2001-12-01	0	66.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-01-01	0	66.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-02-01	0	84.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-03-01	0	90.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-04-01	0	86.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-05-01	0	88.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-06-01	0	84.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-07-01	0	82.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-08-01	0	90.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-09-01	0	82.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-10-01	0	77.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-11-01	0	73.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2002-12-01	0	70.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2003-01-01	0	76.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2003-02-01	0	82.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2003-03-01	0	98.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2003-04-01	0	91.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2003-05-01	0	101.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2003-06-01	0	107.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
...	...	...	...	...	...	...	...	...	...	...
2013-12-01	0	31.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-01-01	0	33.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-02-01	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-03-01	0	39.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-04-01	0	39.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-05-01	0	43.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-06-01	0	38.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-07-01	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-08-01	0	36.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-09-01	0	37.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-10-01	0	38.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-11-01	0	31.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2014-12-01	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-01-01	0	39.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-02-01	0	45.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-03-01	0	46.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-04-01	0	48.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-05-01	0	47.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-06-01	0	44.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-07-01	0	43.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-08-01	0	41.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-09-01	0	35.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-10-01	0	39.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-11-01	0	36.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2015-12-01	0	38.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2016-01-01	0	39.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2016-02-01	0	45.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2016-03-01	0	49.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2016-04-01	0	57.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
2016-05-01	0	51.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States

185 rows × 10 columns

# Everything between June 1990 and March 1995
df["1990-06":"1995-03"]

	is_adj	val	cat_code	cat_desc	cat_indent	dt_code	dt_desc	dt_unit	geo_code	geo_desc
per_name
1990-06-01	0	50.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1990-07-01	0	46.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1990-08-01	0	46.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1990-09-01	0	38.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1990-10-01	0	37.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1990-11-01	0	34.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1990-12-01	0	29.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-01-01	0	30.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-02-01	0	40.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-03-01	0	51.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-04-01	0	50.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-05-01	0	47.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-06-01	0	47.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-07-01	0	43.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-08-01	0	46.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-09-01	0	37.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-10-01	0	41.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-11-01	0	39.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1991-12-01	0	36.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-01-01	0	48.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-02-01	0	55.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-03-01	0	56.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-04-01	0	53.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-05-01	0	52.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-06-01	0	53.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-07-01	0	52.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-08-01	0	56.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-09-01	0	51.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-10-01	0	48.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-11-01	0	42.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1992-12-01	0	42.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-01-01	0	44.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-02-01	0	50.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-03-01	0	60.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-04-01	0	66.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-05-01	0	58.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-06-01	0	59.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-07-01	0	55.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-08-01	0	57.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-09-01	0	57.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-10-01	0	56.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-11-01	0	53.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1993-12-01	0	51.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-01-01	0	46.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-02-01	0	58.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-03-01	0	74.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-04-01	0	65.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-05-01	0	65.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-06-01	0	55.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-07-01	0	52.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-08-01	0	59.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-09-01	0	54.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-10-01	0	57.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-11-01	0	45.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1994-12-01	0	40.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1995-01-01	0	47.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1995-02-01	0	47.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States
1995-03-01	0	60.0	SOLD	New Single-family Houses Sold	0	TOTAL	All Houses	K	US	United States

Info on our time series

If you try to .plot, pandas will automatically use the index (the date) as the x axis for you.

df.plot(y='val')

<matplotlib.axes._subplots.AxesSubplot at 0x1093255f8>

png

Hmmm, looks like something might have happened at some point. Maybe we want to see some numbers instead of a graph? To do aggregate statistics on time series in pandas we use a method called .resample(), and we’re going to tell it to group the data by year.

# http://stackoverflow.com/a/17001474 gives us a list of what we can pass to 'resample'
df.resample('A').median()

	cat_indent	is_adj	val
per_name
1963-12-31	0	0	46.0
1964-12-31	0	0	48.5
1965-12-31	0	0	48.5
1966-12-31	0	0	40.0
1967-12-31	0	0	43.5
1968-12-31	0	0	42.0
1969-12-31	0	0	39.5
1970-12-31	0	0	42.5
1971-12-31	0	0	55.0
1972-12-31	0	0	62.0
1973-12-31	0	0	54.5
1974-12-31	0	0	44.5
1975-12-31	0	0	46.0
1976-12-31	0	0	55.0
1977-12-31	0	0	69.5
1978-12-31	0	0	69.0
1979-12-31	0	0	61.5
1980-12-31	0	0	44.0
1981-12-31	0	0	36.5
1982-12-31	0	0	35.0
1983-12-31	0	0	50.5
1984-12-31	0	0	54.0
1985-12-31	0	0	57.5
1986-12-31	0	0	58.0
1987-12-31	0	0	55.5
1988-12-31	0	0	57.0
1989-12-31	0	0	55.0
1990-12-31	0	0	46.0
1991-12-31	0	0	42.0
1992-12-31	0	0	52.0
1993-12-31	0	0	56.5
1994-12-31	0	0	56.0
1995-12-31	0	0	56.0
1996-12-31	0	0	65.5
1997-12-31	0	0	69.0
1998-12-31	0	0	75.0
1999-12-31	0	0	77.0
2000-12-31	0	0	72.0
2001-12-31	0	0	75.0
2002-12-31	0	0	83.0
2003-12-31	0	0	90.5
2004-12-31	0	0	101.5
2005-12-31	0	0	109.5
2006-12-31	0	0	88.0
2007-12-31	0	0	67.0
2008-12-31	0	0	43.5
2009-12-31	0	0	31.5
2010-12-31	0	0	25.5
2011-12-31	0	0	25.0
2012-12-31	0	0	30.5
2013-12-31	0	0	34.5
2014-12-31	0	0	36.5
2015-12-31	0	0	42.0
2016-12-31	0	0	49.0

That still looks like too much data! What about every decade?

# If 'A' is every year, 10A is every 5 years
df.resample('5A').median()

	cat_indent	is_adj	val
per_name
1963-12-31	0	0	46.0
1968-12-31	0	0	44.0
1973-12-31	0	0	47.5
1978-12-31	0	0	55.0
1983-12-31	0	0	44.0
1988-12-31	0	0	56.5
1993-12-31	0	0	50.5
1998-12-31	0	0	64.0
2003-12-31	0	0	78.0
2008-12-31	0	0	83.5
2013-12-31	0	0	30.0
2018-12-31	0	0	39.0

# We can graph these!
df.plot(y='val', label="Monthly")
df.resample('A').median().plot(y='val', label="Annual")
df.resample('10A').median().plot(y='val', label="Decade")

<matplotlib.axes._subplots.AxesSubplot at 0x1090c76d8>

png

# We can graph these ALL ON THE SAME PLOT!
# we store the 'ax' from the first .plot and pass it to the others
ax = df.plot(y='val', label="Monthly")
df.resample('A').median().plot(y='val', ax=ax, label="Annual")
df.resample('10A').median().plot(y='val', ax=ax, label="Decade")

<matplotlib.axes._subplots.AxesSubplot at 0x109d38390>

png

# Which year had the worst month?
df.resample('A').median()

	cat_indent	is_adj	val
per_name
1963-12-31	0	0	46.0
1964-12-31	0	0	48.5
1965-12-31	0	0	48.5
1966-12-31	0	0	40.0
1967-12-31	0	0	43.5
1968-12-31	0	0	42.0
1969-12-31	0	0	39.5
1970-12-31	0	0	42.5
1971-12-31	0	0	55.0
1972-12-31	0	0	62.0
1973-12-31	0	0	54.5
1974-12-31	0	0	44.5
1975-12-31	0	0	46.0
1976-12-31	0	0	55.0
1977-12-31	0	0	69.5
1978-12-31	0	0	69.0
1979-12-31	0	0	61.5
1980-12-31	0	0	44.0
1981-12-31	0	0	36.5
1982-12-31	0	0	35.0
1983-12-31	0	0	50.5
1984-12-31	0	0	54.0
1985-12-31	0	0	57.5
1986-12-31	0	0	58.0
1987-12-31	0	0	55.5
1988-12-31	0	0	57.0
1989-12-31	0	0	55.0
1990-12-31	0	0	46.0
1991-12-31	0	0	42.0
1992-12-31	0	0	52.0
1993-12-31	0	0	56.5
1994-12-31	0	0	56.0
1995-12-31	0	0	56.0
1996-12-31	0	0	65.5
1997-12-31	0	0	69.0
1998-12-31	0	0	75.0
1999-12-31	0	0	77.0
2000-12-31	0	0	72.0
2001-12-31	0	0	75.0
2002-12-31	0	0	83.0
2003-12-31	0	0	90.5
2004-12-31	0	0	101.5
2005-12-31	0	0	109.5
2006-12-31	0	0	88.0
2007-12-31	0	0	67.0
2008-12-31	0	0	43.5
2009-12-31	0	0	31.5
2010-12-31	0	0	25.5
2011-12-31	0	0	25.0
2012-12-31	0	0	30.5
2013-12-31	0	0	34.5
2014-12-31	0	0	36.5
2015-12-31	0	0	42.0
2016-12-31	0	0	49.0

Cyclical data

It seems like winter might be a time where not very many houses are sold. Let’s see if that’s true!

# Group by the month, check the median
df.groupby(by=df.index.month).median()

	is_adj	val	cat_indent
1	0	44.5	0
2	0	50.0	0
3	0	58.0	0
4	0	59.5	0
5	0	58.0	0
6	0	58.0	0
7	0	55.0	0
8	0	56.0	0
9	0	50.0	0
10	0	51.0	0
11	0	43.0	0
12	0	39.0	0

# Group by the month, check the median, plot the results
df.groupby(by=df.index.month).median().plot(y='val')

<matplotlib.axes._subplots.AxesSubplot at 0x109e36828>

png

# Group by the month, check the median, plot the results
ax = df.groupby(by=df.index.month).median().plot(y='val', legend=False)
ax.set_xticks([1,2,3,4,5,6,7,8,9,10,11, 12])
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.set_ylabel("Houses sold (in thousands)")
ax.set_title("House sales by month, 1963-2016")

<matplotlib.text.Text at 0x10a3e1d68>

png

More details

You can also use max and min and all of your other aggregate friends with .resample. For example, what’s the largest number of houses hold in a given year?

df.resample('A')['val'].max().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x10953db00>

png

# The fewest?
df.resample('A')['val'].min().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x10a2bfda0>

png

# We now know we can look at the range
ax = df.resample('A')['val'].median().plot()
df.resample('A')['val'].max().plot(ax=ax)
df.resample('A')['val'].min().plot(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x109df4d68>

png

# We now know we can look at the range IN AN EVEN COOLER WAY
ax = df.resample('A')['val'].median().plot()
x_values = df.resample('A').index
min_values = df.resample('A')['val'].min()
max_values = df.resample('A')['val'].max()
ax.fill_between(x_values, min_values, max_values, alpha=0.5)
ax.set_ylim([0,130])
ax.set_ylabel("Houses sold (in thousands)")
ax.set_xlabel("Year")
ax.set_title("The Housing Bubble")

/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:3: FutureWarning: .resample() is now a deferred operation
use .resample(...).mean() instead of .resample(...)
  app.launch_new_instance()





<matplotlib.text.Text at 0x1093280b8>

png

Processing time with pandas