import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
# Data from https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9
df = pd.read_csv("../311.csv")
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (17,39,40,41,47,48,49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
len(df)
199999
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 53 columns):
Unique Key                        199999 non-null int64
Created Date                      199999 non-null object
Closed Date                       195983 non-null object
Agency                            199999 non-null object
Agency Name                       199999 non-null object
Complaint Type                    199999 non-null object
Descriptor                        199132 non-null object
Location Type                     78180 non-null object
Incident Zip                      171566 non-null float64
Incident Address                  125786 non-null object
Street Name                       125785 non-null object
Cross Street 1                    132623 non-null object
Cross Street 2                    130444 non-null object
Intersection Street 1             71467 non-null object
Intersection Street 2             71446 non-null object
Address Type                      195273 non-null object
City                              171847 non-null object
Landmark                          9 non-null object
Facility Type                     29626 non-null object
Status                            199999 non-null object
Due Date                          22279 non-null object
Resolution Description            197477 non-null object
Resolution Action Updated Date    197302 non-null object
Community Board                   199999 non-null object
Borough                           199999 non-null object
X Coordinate (State Plane)        170540 non-null float64
Y Coordinate (State Plane)        170540 non-null float64
Park Facility Name                199999 non-null object
Park Borough                      199999 non-null object
School Name                       199999 non-null object
School Number                     199962 non-null object
School Region                     199715 non-null object
School Code                       199715 non-null object
School Phone Number               199999 non-null object
School Address                    199999 non-null object
School City                       199999 non-null object
School State                      199999 non-null object
School Zip                        199999 non-null object
School Not Found                  22080 non-null object
School or Citywide Complaint      13 non-null object
Vehicle Type                      12 non-null object
Taxi Company Borough              64 non-null object
Taxi Pick Up Location             444 non-null object
Bridge Highway Name               155 non-null object
Bridge Highway Direction          155 non-null object
Road Ramp                         151 non-null object
Bridge Highway Segment            160 non-null object
Garage Lot Name                   28 non-null object
Ferry Direction                   9 non-null object
Ferry Terminal Name               65 non-null object
Latitude                          170540 non-null float64
Longitude                         170540 non-null float64
Location                          170540 non-null object
dtypes: float64(5), int64(1), object(47)
memory usage: 80.9+ MB
def parse_date(str_date):
    return dateutil.parser.parse(str_date)

df['happened_at'] = df['Created Date'].apply(parse_date)
# How many ended up as not being parsed?
pd.isnull(df['happened_at']).value_counts()
False    199999
Name: happened_at, dtype: int64
# Move the index over, delete 'happened_at'
df.index = df['happened_at']
del df['happened_at']
# What's our range looking like?
print(df.index.min())
print(df.index.max())
2010-01-04 00:00:00
2016-05-20 01:13:15
# Let's group by year and see how many we have
df.resample("A").count()
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
happened_at
2010-12-31 89509 89509 88895 89509 89509 89509 89441 26313 75468 54647 ... 9 9 9 9 6 0 31 74914 74914 74914
2011-12-31 78604 78604 77998 78604 78604 78604 78581 25783 66446 45087 ... 17 17 17 17 6 0 13 66529 66529 66529
2012-12-31 17 17 16 17 17 17 17 12 15 13 ... 0 0 0 0 0 0 0 15 15 15
2013-12-31 17 17 9 17 17 17 17 10 13 13 ... 1 1 1 1 0 0 0 12 12 12
2014-12-31 189 189 139 189 189 189 189 103 173 180 ... 0 0 0 0 0 0 0 173 173 173
2015-12-31 2069 2069 1919 2069 2069 2069 2068 1031 1820 1623 ... 0 0 0 0 0 0 0 1811 1811 1811
2016-12-31 29594 29594 27007 29594 29594 29594 28819 24928 27631 24223 ... 128 128 124 133 16 9 21 27086 27086 27086

7 rows × 53 columns

# Since we don't have the entire data set we're missing a lot.
df.resample("A").count().plot(y='Unique Key')
<matplotlib.axes._subplots.AxesSubplot at 0x10b69c0b8>

png

# Let's just grab 2010 then
df_2010 = df["2010"]
df_2010.head(2)
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
happened_at
2010-11-18 10:53:00 19157621 11/18/2010 10:53:00 AM 11/18/2010 10:53:00 AM DOT Department of Transportation Street Light Condition Street Light Out NaN 11422.0 NaN ... NaN NaN NaN NaN NaN NaN NaN 40.654108 -73.746911 (40.654107999915, -73.74691077345408)
2010-11-01 17:55:00 19025135 11/01/2010 05:55:00 PM 11/01/2010 08:00:00 PM DEP Department of Environmental Protection Asbestos Asbestos Complaint (B1) NaN 10036.0 235 W 46 ST ... NaN NaN NaN NaN NaN NaN NaN 40.759129 -73.986341 (40.759128565735466, -73.98634107118339)

2 rows × 53 columns

df_2010.resample("B").count()
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
happened_at
2010-01-04 4 4 4 4 4 4 4 0 4 3 ... 0 0 0 0 0 0 0 4 4 4
2010-01-05 1 1 1 1 1 1 1 0 1 1 ... 0 0 0 0 0 0 0 1 1 1
2010-01-06 1 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 1 1 1
2010-01-07 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-08 1 1 1 1 1 1 1 0 1 1 ... 0 0 0 0 0 0 0 1 1 1
2010-01-11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-12 1 1 1 1 1 1 1 0 1 1 ... 0 0 0 0 0 0 0 1 1 1
2010-01-13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-18 2 2 2 2 2 2 2 0 1 1 ... 0 0 0 0 0 0 0 1 1 1
2010-01-19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-22 1 1 1 1 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-25 2 2 2 2 2 2 2 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-27 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-28 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-01-29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-01 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-02 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-03 1 1 1 1 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-04 2 2 2 2 2 2 2 0 1 1 ... 0 0 0 0 0 0 0 1 1 1
2010-02-05 6 6 6 6 6 6 6 1 3 3 ... 0 0 0 0 0 0 0 3 3 3
2010-02-08 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-09 2 2 2 2 2 2 2 0 1 2 ... 0 0 0 0 0 0 0 1 1 1
2010-02-10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2010-02-12 1 1 1 1 1 1 1 0 1 1 ... 0 0 0 0 0 0 0 1 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2010-11-22 1767 1767 1749 1767 1767 1767 1767 425 1495 1081 ... 1 1 1 1 0 0 0 1495 1495 1495
2010-11-23 1773 1773 1764 1773 1773 1773 1773 522 1560 1103 ... 0 0 0 0 0 0 0 1562 1562 1562
2010-11-24 1685 1685 1677 1685 1685 1685 1684 444 1398 999 ... 0 0 0 0 0 0 0 1398 1398 1398
2010-11-25 493 493 492 493 493 493 493 111 448 339 ... 0 0 0 0 0 0 0 450 450 450
2010-11-26 2342 2342 2326 2342 2342 2342 2342 749 2122 1644 ... 2 2 2 2 0 0 0 2124 2124 2124
2010-11-29 1731 1731 1726 1731 1731 1731 1729 455 1452 1089 ... 0 0 0 0 0 0 1 1451 1451 1451
2010-11-30 1853 1853 1847 1853 1853 1853 1852 468 1571 1190 ... 0 0 0 0 0 0 1 1575 1575 1575
2010-12-01 2441 2441 2432 2441 2441 2441 2441 459 1928 1328 ... 1 1 1 1 0 0 0 1928 1928 1928
2010-12-02 1920 1920 1912 1920 1920 1920 1919 504 1607 1204 ... 1 1 1 1 0 0 1 1609 1609 1609
2010-12-03 3348 3348 3336 3348 3348 3348 3346 1112 2896 2233 ... 0 0 0 0 0 0 1 2898 2898 2898
2010-12-06 2009 2009 1997 2009 2009 2009 2009 509 1719 1205 ... 0 0 0 0 0 0 0 1721 1721 1721
2010-12-07 1862 1862 1846 1862 1862 1862 1861 477 1495 1084 ... 0 0 0 0 0 0 1 1495 1495 1495
2010-12-08 1940 1940 1930 1940 1940 1940 1939 549 1651 1271 ... 0 0 0 0 0 0 1 1653 1653 1653
2010-12-09 1982 1982 1968 1982 1982 1982 1979 447 1526 1130 ... 0 0 0 0 0 0 3 1526 1526 1526
2010-12-10 3127 3127 3105 3127 3127 3127 3126 926 2738 2036 ... 0 0 0 0 2 0 1 2739 2739 2739
2010-12-13 2058 2058 2043 2058 2058 2058 2058 478 1655 1092 ... 0 0 0 0 0 0 0 1656 1656 1656
2010-12-14 1978 1978 1965 1978 1978 1978 1977 497 1541 1048 ... 0 0 0 0 0 0 0 1541 1541 1541
2010-12-15 1958 1958 1947 1958 1958 1958 1958 619 1642 1164 ... 1 1 1 1 0 0 0 1642 1642 1642
2010-12-16 2080 2080 2051 2080 2080 2080 2078 554 1739 1245 ... 0 0 0 0 0 0 2 1739 1739 1739
2010-12-17 3245 3245 3228 3245 3245 3245 3242 903 2700 1900 ... 0 0 0 0 1 0 1 2701 2701 2701
2010-12-20 1790 1790 1785 1790 1790 1790 1789 425 1456 1010 ... 0 0 0 0 0 0 0 1456 1456 1456
2010-12-21 1907 1907 1891 1907 1907 1907 1906 487 1559 1133 ... 0 0 0 0 0 0 1 1560 1560 1560
2010-12-22 1746 1746 1741 1746 1746 1746 1745 386 1401 1033 ... 0 0 0 0 0 0 0 1402 1402 1402
2010-12-23 1496 1496 1492 1496 1496 1496 1496 364 1181 832 ... 0 0 0 0 0 0 0 1182 1182 1182
2010-12-24 1336 1336 1334 1336 1336 1336 1334 470 1174 890 ... 1 1 1 1 0 0 2 1175 1175 1175
2010-12-27 394 394 394 394 394 394 394 149 339 243 ... 0 0 0 0 0 0 0 339 339 339
2010-12-28 969 969 966 969 969 969 969 303 736 514 ... 0 0 0 0 0 0 0 736 736 736
2010-12-29 1252 1252 1247 1252 1252 1252 1251 510 1058 738 ... 0 0 0 0 0 0 1 1058 1058 1058
2010-12-30 1872 1872 1851 1872 1872 1872 1870 991 1633 1014 ... 1 1 1 1 0 0 1 1633 1633 1633
2010-12-31 1475 1475 1473 1475 1475 1475 1475 1120 1398 943 ... 0 0 0 0 0 0 0 1398 1398 1398

260 rows × 53 columns

# Well this is the downside of incomplete data!
df_2010.resample("B").count().plot(y='Unique Key')
<matplotlib.axes._subplots.AxesSubplot at 0x10d97a5f8>

png

df.groupby(df.index.hour).count().plot(y='Unique Key')
<matplotlib.axes._subplots.AxesSubplot at 0x10aeb39b0>

png

noise_df = df[df['Complaint Type'].str.contains("Noise")]
noise_df.groupby(noise_df.index.hour).count().plot(y='Unique Key')
<matplotlib.axes._subplots.AxesSubplot at 0x10dd584a8>

png

# Combine the two
noise_df = df[df['Complaint Type'].str.contains("Noise")]
ax = df.groupby(df.index.hour).count().plot(y='Unique Key')
noise_df.groupby(noise_df.index.hour).count().plot(y='Unique Key', ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x10e8587b8>

png

df['Complaint Type'].value_counts()
Street Light Condition                   30886
Traffic Signal Condition                 18492
Street Condition                         18024
Snow                                     13831
Water System                             13623
Sewer                                    10798
Dirty Conditions                          8298
Noise                                     6858
HEATING                                   6172
General Construction/Plumbing             5940
Sanitation Condition                      5023
Building/Use                              5015
Missed Collection (All Materials)         4057
Graffiti                                  3574
HEAT/HOT WATER                            3264
Noise - Residential                       2753
Derelict Vehicles                         2742
Blocked Driveway                          2602
Illegal Parking                           2596
PLUMBING                                  1590
Air Quality                               1526
Special Enforcement                       1237
Elevator                                  1214
Broken Muni Meter                         1196
Other Enforcement                         1169
Lead                                      1123
Noise - Commercial                        1088
GENERAL CONSTRUCTION                      1057
Plumbing                                   951
Water Conservation                         913
                                         ...  
Sweeping/Inadequate                          2
Discipline and Suspension                    2
Ferry Complaint                              2
Bike Rack Condition                          2
DOF Property - RPIE Issue                    2
Complaint                                    2
Illegal Fireworks                            2
Special Natural Area District (SNAD)         2
Literature Request                           1
Public Toilet                                1
Radioactive Material                         1
Poison Ivy                                   1
Parking Card                                 1
Unsanitary Animal Facility                   1
DOF Parking - Address Update                 1
Health                                       1
Research Questions                           1
Sprinkler - Mechanical                       1
Bottled Water                                1
Hazmat Storage/Use                           1
Weatherization                               1
Tunnel Condition                             1
Transportation Provider Complaint            1
Rangehood                                    1
Construction                                 1
Home Delivered Meal - Missed Delivery        1
Animal Facility - No Permit                  1
Lifeguard                                    1
Forensic Engineering                         1
Beach/Pool/Sauna Complaint                   1
Name: Complaint Type, dtype: int64