import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser

First, I made a mistake naming the data set! It’s 2015 data, not 2014 data. But yes, still use 311-2014.csv. You can rename it.

Importing and preparing your data

Import your data, but only the first 200,000 rows. You’ll also want to change the index to a datetime based on the Created Date column: check whether it’s already a datetime, and parse it if not.

df = pd.read_csv("../311-2014.csv", nrows=200000)
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (8,17,48) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 53 columns):
Unique Key                        200000 non-null int64
Created Date                      200000 non-null object
Closed Date                       188913 non-null object
Agency                            200000 non-null object
Agency Name                       200000 non-null object
Complaint Type                    200000 non-null object
Descriptor                        198197 non-null object
Location Type                     179328 non-null object
Incident Zip                      181049 non-null object
Incident Address                  152173 non-null object
Street Name                       152152 non-null object
Cross Street 1                    108035 non-null object
Cross Street 2                    107583 non-null object
Intersection Street 1             24790 non-null object
Intersection Street 2             24530 non-null object
Address Type                      177091 non-null object
City                              181095 non-null object
Landmark                          127 non-null object
Facility Type                     80031 non-null object
Status                            199998 non-null object
Due Date                          152018 non-null object
Resolution Description            198936 non-null object
Resolution Action Updated Date    188529 non-null object
Community Board                   200000 non-null object
Borough                           200000 non-null object
X Coordinate (State Plane)        175825 non-null float64
Y Coordinate (State Plane)        175825 non-null float64
Park Facility Name                200000 non-null object
Park Borough                      200000 non-null object
School Name                       200000 non-null object
School Number                     199907 non-null object
School Region                     197128 non-null object
School Code                       197128 non-null object
School Phone Number               200000 non-null object
School Address                    200000 non-null object
School City                       200000 non-null object
School State                      200000 non-null object
School Zip                        199999 non-null object
School Not Found                  151897 non-null object
School or Citywide Complaint      0 non-null float64
Vehicle Type                      34 non-null object
Taxi Company Borough              434 non-null object
Taxi Pick Up Location             3680 non-null object
Bridge Highway Name               1960 non-null object
Bridge Highway Direction          1959 non-null object
Road Ramp                         1946 non-null object
Bridge Highway Segment            2134 non-null object
Garage Lot Name                   143 non-null object
Ferry Direction                   86 non-null object
Ferry Terminal Name               215 non-null object
Latitude                          175825 non-null float64
Longitude                         175825 non-null float64
Location                          175825 non-null object
dtypes: float64(5), int64(1), object(47)
memory usage: 80.9+ MB
def parse_date(str_date):
    return dateutil.parser.parse(str_date)

df['created_dt'] = df['Created Date'].apply(parse_date)
df.head(2)
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location created_dt
0 31015465 07/06/2015 10:58:27 AM 07/22/2015 01:07:20 AM DCA Department of Consumer Affairs Consumer Complaint Demand for Cash NaN 11360 27-16 203 STREET ... NaN NaN NaN NaN NaN NaN 40.773540 -73.788237 (40.773539552542, -73.78823697228408) 2015-07-06 10:58:27
1 30997660 07/03/2015 01:26:29 PM 07/03/2015 02:08:20 PM NYPD New York City Police Department Vending In Prohibited Area Residential Building/House 10019 200 CENTRAL PARK SOUTH ... NaN NaN NaN NaN NaN NaN 40.767021 -73.979448 (40.76702142171206, -73.97944780718524) 2015-07-03 13:26:29

2 rows × 54 columns

df.index = df['created_dt']
df.head(2)
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location created_dt
created_dt
2015-07-06 10:58:27 31015465 07/06/2015 10:58:27 AM 07/22/2015 01:07:20 AM DCA Department of Consumer Affairs Consumer Complaint Demand for Cash NaN 11360 27-16 203 STREET ... NaN NaN NaN NaN NaN NaN 40.773540 -73.788237 (40.773539552542, -73.78823697228408) 2015-07-06 10:58:27
2015-07-03 13:26:29 30997660 07/03/2015 01:26:29 PM 07/03/2015 02:08:20 PM NYPD New York City Police Department Vending In Prohibited Area Residential Building/House 10019 200 CENTRAL PARK SOUTH ... NaN NaN NaN NaN NaN NaN 40.767021 -73.979448 (40.76702142171206, -73.97944780718524) 2015-07-03 13:26:29

2 rows × 54 columns
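
As an aside, a vectorized alternative to the apply/dateutil approach would be pd.to_datetime, which is usually much faster. A sketch, assuming every Created Date string uses the MM/DD/YYYY hh:mm:ss AM/PM format seen above:

# read_csv's parse_dates=['Created Date'] option could also handle this at import time
df['created_dt'] = pd.to_datetime(df['Created Date'], format='%m/%d/%Y %I:%M:%S %p')
df.index = df['created_dt']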

What was the most popular type of complaint, and how many times was it filed?

df['Complaint Type'].value_counts().sort_values(ascending=False).head(3)
Blocked Driveway    21779
Illegal Parking     19837
HEAT/HOT WATER      12408
Name: Complaint Type, dtype: int64

Make a horizontal bar graph of the top 5 most frequent complaint types.

df['Complaint Type'].value_counts().sort_values(ascending=False).head(5).plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x1092902b0>

png

Which borough has the most complaints per capita? Since there are only 5 boroughs, you can do the math manually.

df['Borough'].value_counts()
BROOKLYN         57129
QUEENS           46824
MANHATTAN        42050
BRONX            29610
Unspecified      17000
STATEN ISLAND     7387
Name: Borough, dtype: int64
print("Brooklyn is", 57129 / 2592000, "per person")
print("Queens is", 46824 / 2296000, "per person")
print("Manhattan is", 42050 / 1626000, "per person")
print("Bronx is", 29610 / 1419000, "per person")
print("Staten Island is", 7387 / 472000, "per person")
# Manhattan has the most complaints per capita (Brooklyn just has the most complaints in total)
Brooklyn is 0.02204050925925926 per person
Queens is 0.020393728222996517 per person
Manhattan is 0.0258610086100861 per person
Bronx is 0.020866807610993656 per person
Staten Island is 0.015650423728813558 per person
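
The same arithmetic can be done in pandas by dividing the value_counts() by a Series of borough populations. A sketch, using the same rough population figures as above:

populations = pd.Series({'BROOKLYN': 2592000, 'QUEENS': 2296000, 'MANHATTAN': 1626000,
                         'BRONX': 1419000, 'STATEN ISLAND': 472000})
# 'Unspecified' has no population figure, so it drops out via dropna()
(df['Borough'].value_counts() / populations).dropna().sort_values(ascending=False)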

According to your selection of data, how many cases were filed in March? How about May?

len(df["2015-03"])
15025
len(df["2015-05"])
49715
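
Note that selecting rows with df["2015-03"] relies on partial-string indexing through plain square brackets; it works on the pandas version used here, but newer pandas versions only support it through .loc. A sketch of the same counts with .loc:

len(df.loc["2015-03"])
len(df.loc["2015-05"])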

I’d like to see all of the 311 complaints called in on April 1st.

Surprise! We couldn’t do this in class, but that was just a limitation of our data set.

len(df["2015-04-01"])
573

What was the most popular type of complaint on April 1st?

What were the three most popular types of complaint on April 1st?

df["2015-04-01"]["Complaint Type"].value_counts().sort_values(ascending=False).head(3)
Illegal Parking     67
Street Condition    64
Blocked Driveway    58
Name: Complaint Type, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 200000 entries, 2015-07-06 10:58:27 to 2015-06-09 12:48:25
Data columns (total 54 columns):
Unique Key                        200000 non-null int64
Created Date                      200000 non-null object
Closed Date                       188913 non-null object
Agency                            200000 non-null object
Agency Name                       200000 non-null object
Complaint Type                    200000 non-null object
Descriptor                        198197 non-null object
Location Type                     179328 non-null object
Incident Zip                      181049 non-null object
Incident Address                  152173 non-null object
Street Name                       152152 non-null object
Cross Street 1                    108035 non-null object
Cross Street 2                    107583 non-null object
Intersection Street 1             24790 non-null object
Intersection Street 2             24530 non-null object
Address Type                      177091 non-null object
City                              181095 non-null object
Landmark                          127 non-null object
Facility Type                     80031 non-null object
Status                            199998 non-null object
Due Date                          152018 non-null object
Resolution Description            198936 non-null object
Resolution Action Updated Date    188529 non-null object
Community Board                   200000 non-null object
Borough                           200000 non-null object
X Coordinate (State Plane)        175825 non-null float64
Y Coordinate (State Plane)        175825 non-null float64
Park Facility Name                200000 non-null object
Park Borough                      200000 non-null object
School Name                       200000 non-null object
School Number                     199907 non-null object
School Region                     197128 non-null object
School Code                       197128 non-null object
School Phone Number               200000 non-null object
School Address                    200000 non-null object
School City                       200000 non-null object
School State                      200000 non-null object
School Zip                        199999 non-null object
School Not Found                  151897 non-null object
School or Citywide Complaint      0 non-null float64
Vehicle Type                      34 non-null object
Taxi Company Borough              434 non-null object
Taxi Pick Up Location             3680 non-null object
Bridge Highway Name               1960 non-null object
Bridge Highway Direction          1959 non-null object
Road Ramp                         1946 non-null object
Bridge Highway Segment            2134 non-null object
Garage Lot Name                   143 non-null object
Ferry Direction                   86 non-null object
Ferry Terminal Name               215 non-null object
Latitude                          175825 non-null float64
Longitude                         175825 non-null float64
Location                          175825 non-null object
created_dt                        200000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(1), object(47)
memory usage: 83.9+ MB

What month has the most reports filed? How many? Graph it.

# I'm doing this both by resample and by groupby
df.resample('M').count()
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location created_dt
created_dt
2015-01-31 7091 7091 6583 7091 7091 7091 7051 6547 6418 5308 ... 76 75 75 7 2 8 6181 6181 6181 7091
2015-02-28 8141 8141 7631 8141 8141 8141 8100 7508 7515 6097 ... 121 121 121 18 4 17 7274 7274 7274 8141
2015-03-31 15025 15025 14305 15025 15025 15025 14931 13742 13833 10775 ... 704 702 702 20 10 22 13444 13444 13444 15025
2015-04-30 20087 20087 19131 20087 20087 20087 19921 17250 17292 13809 ... 311 307 346 15 9 18 16692 16692 16692 20087
2015-05-31 49715 49715 47090 49715 49715 49715 49287 42564 42611 36206 ... 303 301 393 33 17 45 41381 41381 41381 49715
2015-06-30 14459 14459 13416 14459 14459 14459 14341 12274 12474 10460 ... 83 81 99 16 5 18 12067 12067 12067 14459
2015-07-31 15047 15047 13908 15047 15047 15047 14789 14121 14395 11430 ... 75 74 74 13 11 26 13864 13864 13864 15047
2015-08-31 12204 12204 11408 12204 12204 12204 12022 11266 11753 9556 ... 53 52 52 12 12 18 11336 11336 11336 12204
2015-09-30 13679 13679 12911 13679 13679 13679 13492 12790 13024 10769 ... 78 78 85 3 4 10 12551 12551 12551 13679
2015-10-31 24700 24700 23658 24700 24700 24700 24551 23061 23361 21244 ... 88 88 103 2 8 20 23007 23007 23007 24700
2015-11-30 16476 16476 15736 16476 16476 16476 16344 15242 15279 13740 ... 60 60 68 3 4 12 14999 14999 14999 16476
2015-12-31 3373 3373 3134 3373 3373 3373 3365 2960 3091 2776 ... 7 7 16 1 0 1 3026 3026 3026 3373
2016-01-31 3 3 2 3 3 3 3 3 3 3 ... 0 0 0 0 0 0 3 3 3 3

13 rows × 54 columns

df.resample('M').count().plot(y='Unique Key')
<matplotlib.axes._subplots.AxesSubplot at 0x10c07b550>

png

# The full count() output is really wide; just look at Unique Key
#df.groupby(by=df.index.month).count()
df.groupby(by=df.index.month)['Unique Key'].count().sort_values(ascending=False)
5     49715
10    24700
4     20087
11    16476
7     15047
3     15025
6     14459
9     13679
8     12204
2      8141
1      7094
12     3373
Name: Unique Key, dtype: int64
# It's May!
df.groupby(by=df.index.month)['Unique Key'].count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10a721ac8>

png

What week of the year has the most reports filed? How many? Graph the weekly complaints.

df.resample('W')['Unique Key'].count().sort_values(ascending=False).head(3)
created_dt
2015-05-10    13559
2015-05-17    11683
2015-05-24    10351
Name: Unique Key, dtype: int64
# All of these do the same thing, but the last one is fastest because it's only resampling one column
#df.resample('W').count().plot(y='Unique Key')
#df.resample('W')['Unique Key'].count().plot()
df['Unique Key'].resample('W').count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10c600e48>

png

Noise complaints are a big deal. Use .str.contains to select noise complaints, and make a chart of when they show up annually. Then make a chart of when they show up over the course of a day (cyclical).

noise_df = df[df["Complaint Type"].str.contains("Noise")]
len(noise_df)
27824
noise_df['Unique Key'].resample('W').count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10968ba90>

png

# By hour of the day
# Make sure you group by=noise_df.index.hour;
# otherwise, if you use df.index.hour, you're sending it FAR too many groups
noise_df['Unique Key'].groupby(by=noise_df.index.hour).count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10ba43cc0>

png

Which were the top five days of the year for filing complaints? How many on each of those days? Graph it.

df['Unique Key'].resample('D').count().sort_values(ascending=False).head(5)
created_dt
2015-10-28    2697
2015-11-09    2529
2015-05-04    2465
2015-05-11    2293
2015-10-29    2258
Name: Unique Key, dtype: int64
df['Unique Key'].resample('D').count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10a004048>

png

What hour of the day are the most complaints? Graph a day of complaints.

df['Unique Key'].groupby(by=df.index.hour).count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10aabde80>

png
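
To read the peak hour off the series instead of eyeballing the plot, idxmax gives the hour with the most complaints. A small sketch:

by_hour = df['Unique Key'].groupby(by=df.index.hour).count()
by_hour.idxmax(), by_hour.max()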

One of the hours has an odd number of complaints. What are the most common complaints at that hour, and what are the most common complaints the hour before and after?

midnight_df = df[df.index.hour == 0]
midnight_df['Complaint Type'].value_counts().sort_values(ascending=False).head(3)
HEAT/HOT WATER    4534
Rodent            2112
PAINT/PLASTER     1946
Name: Complaint Type, dtype: int64
one_am_df = df[df.index.hour == 1]
one_am_df['Complaint Type'].value_counts().sort_values(ascending=False).head(3)
Noise - Commercial         1025
Noise - Street/Sidewalk     897
Blocked Driveway            479
Name: Complaint Type, dtype: int64
eleven_pm_df = df[df.index.hour == 23]
eleven_pm_df['Complaint Type'].value_counts().sort_values(ascending=False).head(3)
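
A more compact way to compare the three hours side by side, as a sketch:

# Top 3 complaint types for 11 PM, midnight, and 1 AM
for hour in (23, 0, 1):
    print(hour)
    print(df[df.index.hour == hour]['Complaint Type'].value_counts().head(3))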

So odd. What’s the per-minute breakdown of complaints between 12am and 1am? You don’t need to include 1am.

# Get midnight
midnight_df = df[df.index.hour == 0]
# And group it on a per-minute basis
midnight_df.groupby(by=midnight_df.index.minute).count()
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location created_dt
0 17116 17116 16721 17116 17116 17116 17116 17098 17098 16983 ... 0 0 0 0 0 0 17093 17093 17093 17116
1 109 109 108 109 109 109 109 105 103 90 ... 0 0 0 0 0 0 102 102 102 109
2 91 91 88 91 91 91 90 81 88 72 ... 0 0 1 0 0 0 87 87 87 91
3 99 99 97 99 99 99 99 94 96 83 ... 1 1 1 0 0 0 95 95 95 99
4 106 106 103 106 106 106 105 101 103 93 ... 1 1 1 0 0 0 103 103 103 106
5 94 94 91 94 94 94 93 89 90 80 ... 0 0 0 1 0 0 88 88 88 94
6 106 106 103 106 106 106 105 101 101 90 ... 2 2 2 0 0 0 101 101 101 106
7 106 106 103 106 106 106 106 101 102 90 ... 1 1 2 0 0 0 102 102 102 106
8 95 95 94 95 95 95 95 92 92 80 ... 1 1 1 0 0 0 90 90 90 95
9 82 82 80 82 82 82 81 80 78 71 ... 1 1 1 0 0 0 77 77 77 82
10 89 89 87 89 89 89 89 87 87 78 ... 0 0 0 0 0 0 84 84 84 89
11 101 101 99 101 101 101 100 92 99 84 ... 0 0 1 0 0 0 99 99 99 101
12 100 100 99 100 100 100 97 96 96 81 ... 0 0 0 0 0 0 95 95 95 100
13 100 100 98 100 100 100 99 93 94 83 ... 1 1 1 0 0 0 93 93 93 100
14 88 88 86 88 88 88 88 85 84 74 ... 0 0 0 0 0 0 84 84 84 88
15 100 100 100 100 100 100 99 96 97 83 ... 0 0 0 0 1 1 96 96 96 100
16 83 83 82 83 83 83 83 78 77 63 ... 0 0 0 0 0 0 76 76 76 83
17 93 93 91 93 93 93 92 91 89 81 ... 0 0 0 0 0 0 88 88 88 93
18 91 91 90 91 91 91 91 85 89 77 ... 0 0 0 0 0 0 89 89 89 91
19 93 93 91 93 93 93 92 88 88 76 ... 0 0 0 0 0 0 88 88 88 93
20 100 100 98 100 100 100 99 93 99 85 ... 0 0 0 0 0 0 99 99 99 100
21 109 109 108 109 109 109 109 101 106 96 ... 0 0 0 0 0 0 103 103 103 109
22 104 104 99 104 104 104 104 98 103 91 ... 0 0 1 0 0 0 101 101 101 104
23 92 92 87 92 92 92 92 91 86 80 ... 0 0 0 0 0 0 86 86 86 92
24 79 79 76 79 79 79 78 75 75 67 ... 0 0 0 0 0 0 75 75 75 79
25 100 100 96 100 100 100 100 96 96 91 ... 0 0 0 0 0 0 96 96 96 100
26 84 84 79 84 84 84 83 83 81 72 ... 0 0 0 0 0 0 81 81 81 84
27 94 94 91 94 94 94 94 90 88 81 ... 1 1 2 0 0 0 87 87 87 94
28 94 94 90 94 94 94 94 87 93 83 ... 1 1 1 0 0 0 93 93 93 94
29 87 87 86 87 87 87 86 83 82 65 ... 0 0 0 0 1 1 82 82 82 87
30 89 89 87 89 89 89 88 86 87 73 ... 0 0 0 0 0 0 86 86 86 89
31 89 89 87 89 89 89 89 85 86 79 ... 0 0 0 0 0 0 86 86 86 89
32 87 87 84 87 87 87 87 86 84 74 ... 1 1 1 0 0 0 82 82 82 87
33 98 98 94 98 98 98 94 95 91 79 ... 0 0 0 0 0 0 91 91 91 98
34 77 77 75 77 77 77 77 74 76 69 ... 0 0 0 0 0 0 73 73 73 77
35 89 89 85 89 89 89 88 85 84 75 ... 1 1 1 0 0 0 84 84 84 89
36 78 78 73 78 78 78 78 74 72 66 ... 1 1 1 0 0 0 72 72 72 78
37 84 84 82 84 84 84 82 82 81 69 ... 0 0 0 0 0 0 81 81 81 84
38 91 91 91 91 91 91 91 88 88 82 ... 1 1 1 0 0 1 88 88 88 91
39 106 106 100 106 106 106 106 102 99 88 ... 0 0 0 0 0 0 98 98 98 106
40 108 108 105 108 108 108 108 101 102 96 ... 0 0 0 0 0 0 101 101 101 108
41 95 95 92 95 95 95 95 88 88 78 ... 0 0 0 0 0 0 88 88 88 95
42 72 72 68 72 72 72 72 70 65 63 ... 0 0 0 0 0 0 65 65 65 72
43 89 89 89 89 89 89 87 87 89 81 ... 0 0 0 0 0 0 88 88 88 89
44 97 97 93 97 97 97 97 91 94 78 ... 0 0 0 0 0 0 93 93 93 97
45 81 81 78 81 81 81 80 77 78 67 ... 0 0 0 0 0 0 78 78 78 81
46 93 93 88 93 93 93 93 84 85 75 ... 0 0 0 0 0 0 85 85 85 93
47 76 76 74 76 76 76 76 74 74 64 ... 1 1 1 0 0 0 74 74 74 76
48 82 82 79 82 82 82 82 76 76 66 ... 1 1 2 0 0 0 74 74 74 82
49 70 70 68 70 70 70 69 68 70 60 ... 1 1 1 0 0 0 70 70 70 70
50 112 112 108 112 112 112 111 108 106 96 ... 2 2 2 0 0 0 106 106 106 112
51 75 75 71 75 75 75 75 73 71 61 ... 1 1 1 0 0 0 71 71 71 75
52 85 85 83 85 85 85 85 81 83 74 ... 1 1 1 0 0 1 82 82 82 85
53 83 83 79 83 83 83 83 81 81 70 ... 1 1 1 0 0 0 81 81 81 83
54 70 70 66 70 70 70 70 69 66 61 ... 0 0 0 0 0 0 66 66 66 70
55 60 60 59 60 60 60 60 57 56 47 ... 0 0 0 0 0 0 56 56 56 60
56 89 89 87 89 89 89 89 83 85 75 ... 0 0 0 0 0 0 85 85 85 89
57 76 76 75 76 76 76 76 74 76 65 ... 0 0 0 0 0 0 75 75 75 76
58 61 61 56 61 61 61 61 54 55 50 ... 1 1 1 0 0 0 55 55 55 61
59 80 80 77 80 80 80 80 77 76 66 ... 0 0 0 0 0 0 76 76 76 80

60 rows × 54 columns
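
The full per-minute count() table is very wide; counting a single column tells the same story, with minute 0 (17,116 complaints) towering over every other minute. A sketch:

midnight_df.groupby(by=midnight_df.index.minute)['Unique Key'].count().sort_values(ascending=False).head(3)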

Looks like midnight is a bit of an outlier. Why might that be? Take the 5 most common agencies and graph the times at which they file reports (all day, not just midnight).

df['Agency'].value_counts().head(5)
NYPD     80000
HPD      39388
DOT      22308
DPR      15505
DOHMH     8250
Name: Agency, dtype: int64
nypd_df = df[df['Agency'] == 'NYPD']
hpd_df = df[df['Agency'] == 'HPD']
dot_df = df[df['Agency'] == 'DOT']
dpr_df = df[df['Agency'] == 'DPR']
dohmh_df = df[df['Agency'] == 'DOHMH']


ax = nypd_df.groupby(by=nypd_df.index.hour).count().plot(y='Unique Key', label='NYPD')
hpd_df.groupby(by=hpd_df.index.hour).count().plot(y='Unique Key', label='HPD', ax=ax)
dot_df.groupby(by=dot_df.index.hour).count().plot(y='Unique Key', label='DOT', ax=ax)
dpr_df.groupby(by=dpr_df.index.hour).count().plot(y='Unique Key', label='DPR', ax=ax)
dohmh_df.groupby(by=dohmh_df.index.hour).count().plot(y='Unique Key', label='DOHMH', ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x10cc97f28>

png
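
The same hourly chart can also be drawn with a loop instead of five nearly identical lines. A sketch using the top-five list computed above:

top_agencies = df['Agency'].value_counts().head(5).index
ax = None
for agency in top_agencies:
    sub = df[df['Agency'] == agency]
    ax = sub.groupby(by=sub.index.hour)['Unique Key'].count().plot(label=agency, ax=ax)
ax.legend()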

Graph those same agencies on an annual basis - make it weekly. When do people like to complain? When does the NYPD have an odd number of complaints?

nypd_df = df[df['Agency'] == 'NYPD']
hpd_df = df[df['Agency'] == 'HPD']
dot_df = df[df['Agency'] == 'DOT']
dpr_df = df[df['Agency'] == 'DPR']
dohmh_df = df[df['Agency'] == 'DOHMH']

ax = nypd_df.resample('W').count().plot(y='Unique Key', label='NYPD')
hpd_df.resample('W').count().plot(y='Unique Key', ax=ax, label='HPD')
dot_df.resample('W').count().plot(y='Unique Key', ax=ax, label='DOT')
dpr_df.resample('W').count().plot(y='Unique Key', ax=ax, label='DPR')
dohmh_df.resample('W').count().plot(y='Unique Key', ax=ax, label='DOHMH')
<matplotlib.axes._subplots.AxesSubplot at 0x10e84af60>

png

Maybe the NYPD deals with different issues at different times? Check the most popular complaints in July and August vs. the month of May. Also check the most common complaints for the Department of Housing Preservation and Development (HPD) in winter vs. summer.

# July and August
nypd_df["2015-07":"2015-08"]['Complaint Type'].value_counts().head(5)
Illegal Parking            3444
Blocked Driveway           3258
Noise - Street/Sidewalk    3165
Noise - Commercial         1201
Noise - Vehicle             942
Name: Complaint Type, dtype: int64
# May
nypd_df["2015-05"]['Complaint Type'].value_counts().head(5)
Blocked Driveway           4114
Illegal Parking            3975
Noise - Street/Sidewalk    3385
Noise - Commercial         2263
Noise - Vehicle            1232
Name: Complaint Type, dtype: int64
# For getting past the dec/jan boundary in winter...
# You could add together the value_counts()
total = hpd_df["2015-01":"2015-02"]['Complaint Type'].value_counts() + hpd_df["2015-12"]['Complaint Type'].value_counts()
total.sort_values()
APPLIANCE                  32.0
FLOORING/STAIRS            57.0
GENERAL                    66.0
WATER LEAK                 88.0
PAINT/PLASTER             139.0
PLUMBING                  139.0
UNSANITARY CONDITION      190.0
DOOR/WINDOW                 NaN
ELECTRIC                    NaN
HEAT/HOT WATER              NaN
HPD Literature Request      NaN
OUTSIDE BUILDING            NaN
SAFETY                      NaN
Name: Complaint Type, dtype: float64
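
One caveat with adding the two value_counts() together: + produces NaN for any complaint type that appears in only one of the two periods, which is why HEAT/HOT WATER ends up as NaN above. Series.add with fill_value=0 keeps those types, and should match the pd.concat numbers below. A sketch:

winter_counts = hpd_df["2015-01":"2015-02"]['Complaint Type'].value_counts()
december_counts = hpd_df["2015-12"]['Complaint Type'].value_counts()
winter_counts.add(december_counts, fill_value=0).sort_values(ascending=False).head(5)
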
# Or you could combine the dataframes into a new dataframe using pd.concat
combined_df = pd.concat([hpd_df["2015-01":"2015-02"], hpd_df["2015-12"]])
combined_df['Complaint Type'].value_counts()
HEAT/HOT WATER            353
UNSANITARY CONDITION      190
PLUMBING                  139
PAINT/PLASTER             139
DOOR/WINDOW               103
WATER LEAK                 88
ELECTRIC                   71
GENERAL                    66
FLOORING/STAIRS            57
APPLIANCE                  32
SAFETY                     21
HPD Literature Request     18
OUTSIDE BUILDING            3
Name: Complaint Type, dtype: int64
hpd_df["2015-6":"2015-08"]['Complaint Type'].value_counts().head(5)
HEAT/HOT WATER            617
UNSANITARY CONDITION      510
HPD Literature Request    462
PAINT/PLASTER             444
PLUMBING                  309
Name: Complaint Type, dtype: int64