Debugging Classwork

import pandas as pd
import requests
import dateutil.parser
import datetime

Walkthrough #1

mytime = "0926P"
dateutil.parser.parse(mytime)
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-5-4234f0872e0f> in <module>()
----> 1 dateutil.parser.parse(mytime)


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1162         return parser(parserinfo).parse(timestr, **kwargs)
   1163     else:
-> 1164         return DEFAULTPARSER.parse(timestr, **kwargs)
   1165 
   1166 


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    575                 repl['day'] = monthrange(cyear, cmonth)[1]
    576 
--> 577         ret = default.replace(**repl)
    578 
    579         if res.weekday is not None and not res.day:


ValueError: hour must be in 0..23
dateutil.parser.parse(mytime + "M")
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-6-dc0a828bca5e> in <module>()
----> 1 dateutil.parser.parse(mytime + "M")


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1162         return parser(parserinfo).parse(timestr, **kwargs)
   1163     else:
-> 1164         return DEFAULTPARSER.parse(timestr, **kwargs)
   1165 
   1166 


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    575                 repl['day'] = monthrange(cyear, cmonth)[1]
    576 
--> 577         ret = default.replace(**repl)
    578 
    579         if res.weekday is not None and not res.day:


ValueError: hour must be in 0..23
newtime = "../09BLAHBLAH26PM"
#dateutil.parser.parse(newtime)
datetime.datetime.strptime(newtime,"%IBLAHBLAH%M%p")
datetime.datetime(1900, 1, 1, 21, 26)
def convert_to_time(str_time):
    """Parse a compact 12-hour time such as "0929P" into a datetime.

    The input ends with only the first letter of the meridiem ("A" or "P"),
    so an "M" is appended to complete it before parsing. The result carries
    strptime's default date (1900-01-01), since no date is supplied.

    Raises ValueError if the completed string does not match "%I%M%p".
    """
    time_format = "%I%M%p"
    # "%p" only matches a full "AM"/"PM", so finish the marker first.
    completed = str_time + "M"
    parsed = datetime.datetime.strptime(completed, time_format)
    return parsed
timestring = "0929P"
convert_to_time(timestring)
datetime.datetime(1900, 1, 1, 21, 29)
convert_to_time("1201A")
datetime.datetime(1900, 1, 1, 0, 1)

Walkthrough #2

# Read in the dataframe
times_df = pd.read_csv("times-and-serials.csv")
times_df.head()
Owner Time Serial Number
0 Daniella 8:35 AM 7754
1 Carleen 7:46 AM 6881
2 Daron 1:35 AM 4509
3 Cherly 1:35 AM 2310
4 Manda 1:35 AM 4362
# Try out a way of parsing one of the dates
datetime.datetime.strptime("8:35 AM", "%I:%M %p")
datetime.datetime(1900, 1, 1, 8, 35)
import numpy as np

# Build a function using that method
def time_to_datetime(str_time):
    """Convert a "H:MM AM/PM" string into a datetime, or NaN if it can't be.

    Surrounding whitespace is ignored. The returned datetime carries
    strptime's default date (1900-01-01), since only a time is parsed.

    Returns np.nan (rather than raising) when the value is the '-999'
    marker, is not a string, or does not match the "%I:%M %p" format, so
    the function can be used with Series.apply on a messy column.
    """
    try:
        # '-999' shows up in the Time column in place of a real time.
        if str_time == '-999':
            return np.nan
        return datetime.datetime.strptime(str_time.strip(), "%I:%M %p")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the cell is not a string (e.g. a number
        # or a missing value). ValueError: the text is not a valid 12-hour
        # time. Anything else (KeyboardInterrupt, SystemExit, real bugs) is
        # deliberately allowed to propagate instead of being hidden as NaN.
        return np.nan
# Apply that method to the 'Time' column of the dataframe
times_df['Time'].apply(time_to_datetime)
0    1900-01-01 08:35:00
1    1900-01-01 07:46:00
2    1900-01-01 01:35:00
3    1900-01-01 01:35:00
4    1900-01-01 01:35:00
5    1900-01-01 12:57:00
6    1900-01-01 03:49:00
7                    NaT
8    1900-01-01 00:36:00
9    1900-01-01 17:19:00
10   1900-01-01 21:20:00
11   1900-01-01 09:32:00
12   1900-01-01 17:19:00
13   1900-01-01 09:57:00
14   1900-01-01 21:33:00
15                   NaT
16   1900-01-01 20:32:00
17   1900-01-01 21:19:00
18   1900-01-01 00:58:00
19   1900-01-01 06:40:00
20   1900-01-01 02:47:00
21   1900-01-01 17:19:00
22   1900-01-01 15:38:00
23                   NaT
24   1900-01-01 05:58:00
25   1900-01-01 04:16:00
26   1900-01-01 09:03:00
27   1900-01-01 22:19:00
28   1900-01-01 21:04:00
29                   NaT
30   1900-01-01 09:12:00
31   1900-01-01 01:17:00
32   1900-01-01 22:35:00
33   1900-01-01 13:39:00
34   1900-01-01 20:58:00
35   1900-01-01 12:45:00
36                   NaT
37   1900-01-01 20:19:00
38   1900-01-01 18:41:00
39   1900-01-01 07:40:00
40   1900-01-01 09:43:00
41   1900-01-01 10:51:00
42   1900-01-01 12:07:00
43   1900-01-01 22:58:00
44   1900-01-01 16:31:00
Name: Time, dtype: datetime64[ns]
# Apply that function to the 'Time' column again, this time saving the result as a new column
times_df['converted_time'] = times_df['Time'].apply(time_to_datetime)
# Let's take a peek at our new column
times_df.head(10)
Owner Time Serial Number converted_time
0 Daniella 8:35 AM 7754 1900-01-01 08:35:00
1 Carleen 7:46 AM 6881 1900-01-01 07:46:00
2 Daron 1:35 AM 4509 1900-01-01 01:35:00
3 Cherly 1:35 AM 2310 1900-01-01 01:35:00
4 Manda 1:35 AM 4362 1900-01-01 01:35:00
5 Keri 12:57 PM 3360 1900-01-01 12:57:00
6 Frank 3:49 AM 5901 1900-01-01 03:49:00
7 Berneice -999 6995 NaT
8 Janis 12:36 AM 4788 1900-01-01 00:36:00
9 Tosha 5:19 PM 2585 1900-01-01 17:19:00
# Let's look at all of the rows where the converted time
# didn't end up working out
times_df[pd.isnull(times_df['converted_time'])]
Owner Time Serial Number converted_time
7 Berneice -999 6995 NaT
15 Renato GERTRUDE 3226 NaT
23 Monserrate 45:18 PM 5634 NaT
29 Brianne 527 0 NaT
36 Meggan 0:17 AM 5241 NaT
# Don't do this, it won't work (a missing value is not the string 'NaN'):
# if whatever == 'NaN'
# Do this instead: pd.isnull(whatever)
import numpy as np
nan
%pdb on
Automatic pdb calling has been turned ON

Walkthrough #4

  1. I want to make sure my Plate ID is a string. Can’t lose the leading zeroes!
  2. I don’t think anyone’s car was built in 0AD. Discard the ‘0’s as NaN.
  3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.
  4. “Date first observed” is a pretty weird column, but it seems like it has a date hiding inside. Using a function with .apply, transform the string (e.g. “20140324”) into a Python date. Make the 0’s show up as NaN.
  5. “Violation time” is… not a time. Make it a time.
  6. There sure are a lot of colors of cars, too bad so many of them are the same. Standardize them so that “BLK” and “BLACK”, “WT” and “WHITE”, and any other equivalent combinations that you notice end up as the same value.
  7. Join the data with the Parking Violations Code dataset from the NYC Open Data site.

read_csv documentation can be found at http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

# import numpy as np
df = pd.read_csv("../million-violations.csv", nrows=1000000)
df.head()