import pandas as pd
df = pd . read_csv ( "../Civil_List_2014.csv" ,
names = [ "DPT" , "NAME" , "ADDRESS" , "TTL" , "PC" , "SAL-RATE" ],
skiprows = 1 ,
dtype = col_types )
df . head ( 5 )
DPT
NAME
ADDRESS
TTL
PC
SAL-RATE
0
868
B J SANDIFORD
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
1
868
C A WIGFALL
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
2
69
A E A-AWOSOGBA
HRA/DEPARTMENT OF SOCIAL S
52311
A
$51955.00
3
868
K D AABY
DEPARTMENT OF CITYWIDE ADM
10209
X
$12.00
4
56
I D AADIL
POLICE DEPARTMENT
71012
A
$46953.00
df . head ( 10 )
DPT
NAME
ADDRESS
TTL
PC
SAL-RATE
0
DPT
NAME
ADDRESS
TTL #
PC
SAL-RATE
1
868
B J SANDIFORD
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
2
868
C A WIGFALL
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
3
69
A E A-AWOSOGBA
HRA/DEPARTMENT OF SOCIAL S
52311
A
$51955.00
4
868
K D AABY
DEPARTMENT OF CITYWIDE ADM
10209
X
$12.00
5
56
I D AADIL
POLICE DEPARTMENT
71012
A
$46953.00
6
69
M AAKIRI
HRA/DEPARTMENT OF SOCIAL S
56056
A
$33000.00
7
464
A AALAI
CUNY QUEENSBOROUGH COMMUNI
4607
N
$73.53
8
998
A V AALEVIK
N.Y.C. TRANSIT AUTHORITY
402
2
$33280.00
9
998
M AAMIR
N.Y.C. TRANSIT AUTHORITY
00T07
4
$60878.00
df [ 'SAL-RATE' ] . describe ()
count 269838
unique 27533
top $76488.00
freq 26024
Name: SAL-RATE, dtype: object
df [ 'PC' ] . value_counts ()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
1944 try:
-> 1945 return self._engine.get_loc(key)
1946 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()
KeyError: 'PC'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-6-b75b76613fc7> in <module>()
----> 1 df['PC'].value_counts()
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/frame.py in __getitem__(self, key)
1995 return self._getitem_multilevel(key)
1996 else:
-> 1997 return self._getitem_column(key)
1998
1999 def _getitem_column(self, key):
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2002 # get column
2003 if self.columns.is_unique:
-> 2004 return self._get_item_cache(key)
2005
2006 # duplicate columns & possible reduce dimensionality
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1348 res = cache.get(item)
1349 if res is None:
-> 1350 values = self._data.get(item)
1351 res = self._box_item_values(item, values)
1352 cache[item] = res
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3288
3289 if not isnull(item):
-> 3290 loc = self.items.get_loc(item)
3291 else:
3292 indexer = np.arange(len(self.items))[isnull(self.items)]
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
1945 return self._engine.get_loc(key)
1946 except KeyError:
-> 1947 return self._engine.get_loc(self._maybe_cast_indexer(key))
1948
1949 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()
KeyError: 'PC'
df . columns
Index(['DPT ', 'NAME ', 'ADDRESS ', 'TTL # ', 'PC ', 'SAL-RATE'], dtype='object')
"DPT " . strip ()
for col in df . columns :
print ( col . strip ())
DPT
NAME
ADDRESS
TTL #
PC
SAL-RATE
# it's a list, so - []
# it's a loop, so - [for col in df.columns]
# col.strip(), so - [col.strip() for col in columns]
[ col . strip () for col in df . columns ]
['DPT', 'NAME', 'ADDRESS', 'TTL #', 'PC', 'SAL-RATE']
df . columns = [ col . strip () for col in df . columns ]
df . columns
Index(['DPT', 'NAME', 'ADDRESS', 'TTL #', 'PC', 'SAL-RATE'], dtype='object')
df [ 'PC' ] . value_counts ()
A 100521
D 54881
X 24403
4 23259
3 19866
1 13083
K 7635
F 6361
N 4671
U 3409
B 2817
BW 1766
E 1300
W 1257
G 1085
I 730
2 721
M 708
O 536
WK 488
P 307
L 25
S 4
0 3
V 2
Name: PC, dtype: int64
df [ 'SAL-RATE' ] . describe ()
count 269838
unique 27533
top $76488.00
freq 26024
Name: SAL-RATE, dtype: object
def money_to_float ( money_str ):
return float ( money_str . replace ( "$" , "" ) . replace ( "," , "" ))
print ( money_to_int ( "$76488.00" ))
print ( money_to_int ( "$5.00" ))
print ( money_to_int ( "$1,500.00" ))
#money_to_float(df['SAL-RATE'])
df [ 'SAL-RATE' ] . apply ( money_to_float )
0 5.00
1 5.00
2 51955.00
3 12.00
4 46953.00
5 33000.00
6 73.53
7 33280.00
8 60878.00
9 69272.00
10 89960.00
11 38324.00
12 39019.00
13 369.92
14 66127.00
15 64250.00
16 32510.00
17 2.65
18 109087.00
19 111606.00
20 5.00
21 56555.00
22 51643.00
23 77015.00
24 76488.00
25 67627.00
26 68095.00
27 66127.00
28 76689.00
29 48153.00
...
269808 85287.00
269809 65564.00
269810 56937.00
269811 81439.00
269812 118506.00
269813 42064.00
269814 33.18
269815 116364.00
269816 98072.00
269817 107563.00
269818 37150.00
269819 74646.00
269820 64040.00
269821 76488.00
269822 83802.00
269823 17.16
269824 85667.00
269825 46.89
269826 67.42
269827 51916.00
269828 57514.00
269829 74605.00
269830 74605.00
269831 64403.00
269832 51936.00
269833 47.58
269834 87738.00
269835 78683.00
269836 76488.00
269837 5.00
Name: SAL-RATE, dtype: float64
df [ 'salary' ] = df [ 'SAL-RATE' ] . apply ( money_to_float )
df . head ()
DPT
NAME
ADDRESS
TTL #
PC
SAL-RATE
salary
0
868
B J SANDIFORD
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
5.0
1
868
C A WIGFALL
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
5.0
2
69
A E A-AWOSOGBA
HRA/DEPARTMENT OF SOCIAL S
52311
A
$51955.00
51955.0
3
868
K D AABY
DEPARTMENT OF CITYWIDE ADM
10209
X
$12.00
12.0
4
56
I D AADIL
POLICE DEPARTMENT
71012
A
$46953.00
46953.0
df [ 'salary' ] . describe ()
count 269838.000000
mean 54143.707486
std 32345.656289
min 0.500000
25% 36899.000000
50% 57855.000000
75% 76488.000000
max 300000.000000
Name: salary, dtype: float64
% matplotlib inline
df [ 'salary' ] . hist ( bins = 50 )
<matplotlib.axes._subplots.AxesSubplot at 0x10c3177b8>
df [ 'ADDRESS' ] . value_counts ()
N.Y.C. TRANSIT AUTHORITY 56932
POLICE DEPARTMENT 51453
FIRE DEPARTMENT 15848
DEPARTMENT OF CITYWIDE ADM 15739
HRA/DEPARTMENT OF SOCIAL S 13676
DEPARTMENT OF EDUCATION 11689
N.Y.C. HOUSING AUTHORITY 11501
DEPARTMENT OF CORRECTION 10188
DEPARTMENT OF SANITATION 9666
DEPARTMENT OF PARKS & RECR 7261
ADMINISTRATION FOR CHILDRE 6033
DEPARTMENT OF ENVIRONMENTA 5717
DEPARTMENT OF HEALTH AND M 5676
DEPARTMENT OF TRANSPORTATI 4589
CUNY LAGUARDIA COMMUNITY C 3379
CUNY KINGSBOROUGH COMMMUNI 3260
CUNY MANHATTAN COMMUNITY C 2974
TRIBOROUGH BRIDGE AND TUNN 2254
HOUSING PRESERVATION & DEV 2078
CUNY QUEENSBOROUGH COMMUNI 2030
DEPARTMENT OF HOMELESS SER 1966
DEPARTMENT OF FINANCE 1953
CUNY BRONX COMMUNITY COLLE 1923
LAW DEPARTMENT 1514
DISTRICT ATTORNEY-NEW YORK 1384
CUNY HOSTOS COMMUNITY COLL 1295
DEPARTMENT OF DESIGN AND C 1193
DEPARTMENT OF INFORMATION 1162
DEPARTMENT OF BUILDINGS 1111
DISTRICT ATTORNEY-KINGS CO 1062
...
COMMUNITY BOARD NO.12-BROO 3
COMMUNITY BOARD NO.4-QUEEN 3
COMMUNITY BOARD NO.1-BROOK 3
COMMUNITY BOARD NO.13-QUEE 3
COMMUNITY BOARD NO.12-BRON 3
COMMUNITY BOARD NO.11-QUEE 3
COMMUNITY BOARD NO.2-BROOK 3
COMMUNITY BOARD NO.14-QUEE 3
COMMUNITY BOARD NO.7-MANHA 3
COMMUNITY BOARD N0.9-MANHA 3
COMMUNITY BOARD NO.6-BROOK 3
COMMUNITY BOARD NO.3-BROOK 3
COMMUNITY BOARD NO.8-BRONX 3
COMMUNITY BOARD NO.11-BROO 3
COMMUNITY BOARD NO.4-MANHA 3
COMMUNITY BOARD NO.8-BROOK 3
COMMUNITY BOARD NO.12-QUEE 3
COMMUNITY BOARD NO.10-BROO 3
COMMUNITY BOARD NO.1-RICHM 3
COMMUNITY BOARD NO.14-BROO 3
COMMUNITY BOARD NO.6-BRONX 2
COMMUNITY BOARD NO.9-QUEEN 2
COMMUNITY BOARD NO.15-BROO 2
COMMUNITY BOARD NO.1 BRONX 2
CUNY MEDGAR EVERS COLLEGE 2
COMMUNITY BOARD NO.9-BROOK 2
COMMUNITY BOARD NO.3-BRONX 2
COMMUNITY BOARD NO.6-MANHA 2
COMMUNITY BOARD NO.16-BROO 2
COMMUNITY BOARD NO.5-BRONX 2
Name: ADDRESS, dtype: int64
df . head ()
DPT
NAME
ADDRESS
TTL #
PC
SAL-RATE
salary
0
868
B J SANDIFORD
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
5.0
1
868
C A WIGFALL
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
5.0
2
69
A E A-AWOSOGBA
HRA/DEPARTMENT OF SOCIAL S
52311
A
$51955.00
51955.0
3
868
K D AABY
DEPARTMENT OF CITYWIDE ADM
10209
X
$12.00
12.0
4
56
I D AADIL
POLICE DEPARTMENT
71012
A
$46953.00
46953.0
agencies_df = pd . read_csv ( "../cleaned-agencies.csv" , dtype = 'str' )
agencies_df . head ()
code
agency
is_mayoral
0
002
OFFICE OF THE MAYOR
N
1
003
BOARD OF ELECTIONS
Y
2
004
CAMPAIGN FINANCE BOARD
Y
3
005
BOARD OF ESTIMATE-SECRETARY
Y
4
006
MAYOR'S OFFICE OF OPERATIONS
N
col_types = { 'code' : 'str' }
agencies_df = pd . read_csv ( "../cleaned-agencies.csv" , dtype = col_types )
agencies_df . head ()
code
agency
is_mayoral
0
002
OFFICE OF THE MAYOR
N
1
003
BOARD OF ELECTIONS
Y
2
004
CAMPAIGN FINANCE BOARD
Y
3
005
BOARD OF ESTIMATE-SECRETARY
Y
4
006
MAYOR'S OFFICE OF OPERATIONS
N
col_types = { 'code' : 'str' }
agencies_df = pd . read_csv ( "../cleaned-agencies.csv" , dtype = col_types , true_values = 'Y' , false_values = 'N' )
agencies_df . head ()
code
agency
is_mayoral
0
-999
THIS IS FAKE
False
1
-999
THIS IS ALSO FAKE
True
2
-999
THESE ARE ALL FAKE
False
3
002
OFFICE OF THE MAYOR
False
4
003
BOARD OF ELECTIONS
True
col_types = { 'code' : 'str' }
agencies_df = pd . read_csv ( "../cleaned-agencies.csv" , na_values = [ "-999" ], dtype = col_types , true_values = 'Y' , false_values = 'N' )
agencies_df . head ()
code
agency
is_mayoral
0
NaN
THIS IS FAKE
False
1
NaN
THIS IS ALSO FAKE
True
2
NaN
THESE ARE ALL FAKE
False
3
002
OFFICE OF THE MAYOR
False
4
003
BOARD OF ELECTIONS
True
agencies_df = pd . read_csv ( "../cleaned-agencies.csv" )
agencies_df . head ()
code
agency
is_mayoral
0
-999
THIS IS FAKE
N
1
-999
THIS IS ALSO FAKE
Y
2
-999
THESE ARE ALL FAKE
N
3
2
OFFICE OF THE MAYOR
N
4
3
BOARD OF ELECTIONS
Y
df . head ()
DPT
NAME
ADDRESS
TTL
PC
SAL-RATE
0
868
B J SANDIFORD
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
1
868
C A WIGFALL
DEPARTMENT OF CITYWIDE ADM
12702
X
$5.00
2
69
A E A-AWOSOGBA
HRA/DEPARTMENT OF SOCIAL S
52311
A
$51955.00
3
868
K D AABY
DEPARTMENT OF CITYWIDE ADM
10209
X
$12.00
4
56
I D AADIL
POLICE DEPARTMENT
71012
A
$46953.00
df . merge ( agencies_df , left_on = 'DPT' , right_on = 'code' )
DPT
NAME
ADDRESS
TTL
PC
SAL-RATE
code
agency
is_mayoral