In [1]:

# The usual preamble
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

关于脏数据最头疼的一个问题是，我怎么知道数据是“脏”的？

接下来我们会继续使用NYC 311数据，因为它足够大并且处理起来有点棘手。

In [2]:

requests = pd.read_csv('../data/311-service-requests.csv')

/home/tianjun/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

7.1 怎么知道它是“脏”数据？¶

接下来选取其中的几列来看看，我已经知道其中的邮编有问题，所以先看看这个。

为了知道是不是有问题，我先用.unique()来列出其所有值。如果这个字段是一个数值的话，我一般会画个直方图来看看它的分布。

查看“Incident Zip”之后，显然这个字段是有许多问题的：

其中有些是浮点型数据，另一些却转成了字符串类型
其中还包含有nan型的数据
某些编码包含比较长的29616-0759 或是只有很短的两位 83
还有些空值并没有被pandas处理，而是转成了字符串类型比如'N/A' 和'NO CLUE'

我们要做的是：

将'N/A'和'NO CLUE' 转成通常的nan类型
看看83这行数据是什么意思，然后决定怎么处理
将所有数据转成string类型

In [3]:

requests['Incident Zip'].unique()

Out[3]:

array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0,
       11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0,
       11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0,
       10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0,
       11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0,
       10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0,
       10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0,
       10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0,
       11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0,
       10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0,
       10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0,
       10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0,
       11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0,
       10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0,
       11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0,
       11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0,
       11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0,
       11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0,
       10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0,
       11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0,
       10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0,
       10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0,
       10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0,
       10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0,
       11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0,
       10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0,
       11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0,
       10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310',
       '11105', '10462', '10029', '10301', '10457', '10467', '10469',
       '11225', '10035', '10031', '11226', '10454', '11221', '10025',
       '11229', '11235', '11422', '10472', '11208', '11102', '10032',
       '11216', '10473', '10463', '11213', '10040', '10302', '11231',
       '10470', '11204', '11104', '11212', '10466', '11416', '11214',
       '10009', '11692', '11385', '11423', '11201', '10024', '11435',
       '10312', '10030', '11106', '10033', '10303', '11215', '11222',
       '11354', '10016', '10034', '11420', '10304', '10019', '11237',
       '11249', '11230', '11372', '11207', '11378', '11419', '11361',
       '10011', '11357', '10012', '11358', '10003', '10002', '11374',
       '10007', '11234', '10065', '11369', '11434', '11205', '11206',
       '11415', '11236', '11218', '11413', '10458', '11101', '10306',
       '11355', '10023', '11368', '10314', '11421', '10010', '10018',
       '11223', '10455', '11377', '11433', '11375', '10037', '11209',
       '10459', '10128', '10014', '10282', '11373', '10451', '11238',
       '11211', '10038', '11694', '11203', '11691', '11232', '10305',
       '10021', '11228', '10036', '10001', '10017', '11217', '11219',
       '10308', '10465', '11379', '11414', '10460', '11417', '11220',
       '11366', '10027', '11370', '10309', '11412', '11356', '10456',
       '11432', '10022', '10013', '11367', '11040', '10026', '10475',
       '11210', '11364', '11426', '10471', '10119', '11224', '11418',
       '11429', '11365', '10461', '11239', '10039', '00083', '11411',
       '10075', '11004', '11360', '10453', '10028', '11430', '10307',
       '11103', '10004', '10069', '10005', '10474', '11428', '11436',
       '10020', '11001', '11362', '11693', '10464', '11427', '10044',
       '11363', '10006', '10000', '02061', '77092-2016', '10280', '11109',
       '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE',
       '90010', '10281', '11747', '23541', '11776', '11697', '11788',
       '07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0,
       7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803',
       '11716', '11722', '11549-3650', '10162', '92123', '23502', '11518',
       '07020', '08807', '11577', '07114', '11003', '07201', '11563',
       '61702', '10103', '29616-0759', '35209-3114', '11520', '11735',
       '10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0,
       13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object)

7.3 处理nan 以及string/float 类型转换¶

可以直接将传递参数na_values到read_csv函数来处理这些异常值。此外指定‘Incident Zip’字段为str类型。

In [4]:

na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})

In [5]:

requests['Incident Zip'].unique()

Out[5]:

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209', '10021', '10037', '11413', '11375', '11238', '10473',
       '11103', '11354', '11361', '11106', '11385', '10463', '10467',
       '11204', '11237', '11377', '11364', '11434', '11435', '11210',
       '11228', '11368', '11694', '10464', '11415', '10314', '10301',
       '10018', '10038', '11105', '11230', '10468', '11104', '10471',
       '11416', '10075', '11422', '11355', '10028', '10462', '10306',
       '10461', '11224', '11429', '10035', '11366', '11362', '11206',
       '10460', '10304', '11360', '11411', '10455', '10475', '10069',
       '10303', '10308', '10302', '11357', '10470', '11367', '11370',
       '10454', '10451', '11436', '11426', '10153', '11004', '11428',
       '11427', '11001', '11363', '10004', '10474', '11430', '10000',
       '10307', '11239', '10119', '10006', '10048', '11697', '11692',
       '11693', '10573', '00083', '11559', '10020', '77056', '11776',
       '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',
       '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541',
       '11788', '07604', '10112', '11563', '11580', '07087', '11042',
       '07093', '11501', '92123', '00000', '11575', '07109', '11797',
       '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518',
       '07020', '08807', '11577', '07114', '11003', '07201', '61702',
       '10103', '29616-0759', '35209-3114', '11520', '11735', '10129',
       '11005', '41042', '11590', '06901', '07208', '11530', '13221',
       '10954', '11111', '10107'], dtype=object)

7.4 有横线的数据是个什么情况?¶

In [6]:

rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
len(requests[rows_with_dashes])

Out[6]:

In [7]:

requests[rows_with_dashes]

Out[7]:

	Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
29136	26550551	10/24/2013 06:16:34 PM	NaN	DCA	Department of Consumer Affairs	Consumer Complaint	False Advertising	NaN	77092-2016	2700 EAST SELTICE WAY	EAST SELTICE WAY	NaN	NaN	NaN	NaN	NaN	HOUSTON	NaN	NaN	Assigned	11/13/2013 11:15:20 AM	10/29/2013 11:16:16 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
30939	26548831	10/24/2013 09:35:10 AM	NaN	DCA	Department of Consumer Affairs	Consumer Complaint	Harassment	NaN	55164-0737	P.O. BOX 64437	64437	NaN	NaN	NaN	NaN	NaN	ST. PAUL	NaN	NaN	Assigned	11/13/2013 02:30:21 PM	10/29/2013 02:31:06 PM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
70539	26488417	10/15/2013 03:40:33 PM	NaN	TLC	Taxi and Limousine Commission	Taxi Complaint	Driver Complaint	Street	11549-3650	365 HOFSTRA UNIVERSITY	HOFSTRA UNIVERSITY	NaN	NaN	NaN	NaN	NaN	HEMSTEAD	NaN	NaN	Assigned	11/30/2013 01:20:33 PM	10/16/2013 01:21:39 PM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	La Guardia Airport	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
85821	26468296	10/10/2013 12:36:43 PM	10/26/2013 01:07:07 AM	DCA	Department of Consumer Affairs	Consumer Complaint	Debt Not Owed	NaN	29616-0759	PO BOX 25759	BOX 25759	NaN	NaN	NaN	NaN	NaN	GREENVILLE	NaN	NaN	Closed	10/26/2013 09:20:28 AM	10/26/2013 01:07:07 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
89304	26461137	10/09/2013 05:23:46 PM	10/25/2013 01:06:41 AM	DCA	Department of Consumer Affairs	Consumer Complaint	Harassment	NaN	35209-3114	600 BEACON PKWY	BEACON PKWY	NaN	NaN	NaN	NaN	NaN	BIRMINGHAM	NaN	NaN	Closed	10/25/2013 02:43:42 PM	10/25/2013 01:06:41 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

我一开始以为是数据确实，然后直接将它用nan代替了：

requests['Incident Zip'][rows_with_dashes] = np.nan

不过我朋友Dave告诉我那些9位数的邮编是正常的。现在先找出这些邮编，然后去掉后面的部分只保留前5位。

In [9]:

long_zip_codes = requests['Incident Zip'].str.len() > 5
requests['Incident Zip'][long_zip_codes].unique()

Out[9]:

array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',
       '35209-3114'], dtype=object)

Those all look okay to truncate to me.

In [10]:

requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)

搞定。

之前我以为00083是个错误的邮编，不过后来发现中央公园的邮编居然是00083！涨姿势了。先在最关心的还是00000 这样的编码。先看下这些数据吧：

In [11]:

requests[requests['Incident Zip'] == '00000']

Out[11]:

	Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
42600	26529313	10/22/2013 02:51:06 PM	NaN	TLC	Taxi and Limousine Commission	Taxi Complaint	Driver Complaint	NaN	00000	EWR EWR	EWR	NaN	NaN	NaN	NaN	NaN	NEWARK	NaN	NaN	Assigned	12/07/2013 09:53:51 AM	10/23/2013 09:54:43 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	Other	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
60843	26507389	10/17/2013 05:48:44 PM	NaN	TLC	Taxi and Limousine Commission	Taxi Complaint	Driver Complaint	Street	00000	1 NEWARK AIRPORT	NEWARK AIRPORT	NaN	NaN	NaN	NaN	NaN	NEWARK	NaN	NaN	Assigned	12/02/2013 11:59:46 AM	10/18/2013 12:01:08 PM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	Other	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

也看不出什么名堂，就全都设为nan吧。

In [12]:

zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan

现在看下处理后的结果

In [13]:

unique_zips = requests['Incident Zip'].unique()
unique_zips.sort()
unique_zips

Out[13]:

array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109',
       '07114', '07201', '07208', '07306', '07604', '08807', '10000',
       '10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10009', '10010', '10011', '10012', '10013', '10014', '10016',
       '10017', '10018', '10019', '10020', '10021', '10022', '10023',
       '10024', '10025', '10026', '10027', '10028', '10029', '10030',
       '10031', '10032', '10033', '10034', '10035', '10036', '10037',
       '10038', '10039', '10040', '10044', '10048', '10065', '10069',
       '10075', '10103', '10107', '10112', '10119', '10128', '10129',
       '10153', '10162', '10280', '10281', '10282', '10301', '10302',
       '10303', '10304', '10305', '10306', '10307', '10308', '10309',
       '10310', '10312', '10314', '10451', '10452', '10453', '10454',
       '10455', '10456', '10457', '10458', '10459', '10460', '10461',
       '10462', '10463', '10464', '10465', '10466', '10467', '10468',
       '10469', '10470', '10471', '10472', '10473', '10474', '10475',
       '10573', '10803', '10954', '11001', '11003', '11004', '11005',
       '11040', '11042', '11101', '11102', '11103', '11104', '11105',
       '11106', '11109', '11111', '11201', '11203', '11204', '11205',
       '11206', '11207', '11208', '11209', '11210', '11211', '11212',
       '11213', '11214', '11215', '11216', '11217', '11218', '11219',
       '11220', '11221', '11222', '11223', '11224', '11225', '11226',
       '11228', '11229', '11230', '11231', '11232', '11233', '11234',
       '11235', '11236', '11237', '11238', '11239', '11249', '11354',
       '11355', '11356', '11357', '11358', '11360', '11361', '11362',
       '11363', '11364', '11365', '11366', '11367', '11368', '11369',
       '11370', '11372', '11373', '11374', '11375', '11377', '11378',
       '11379', '11385', '11411', '11412', '11413', '11414', '11415',
       '11416', '11417', '11418', '11419', '11420', '11421', '11422',
       '11423', '11426', '11427', '11428', '11429', '11430', '11432',
       '11433', '11434', '11435', '11436', '11501', '11518', '11520',
       '11530', '11549', '11559', '11563', '11575', '11577', '11580',
       '11590', '11691', '11692', '11693', '11694', '11697', '11716',
       '11722', '11735', '11747', '11776', '11788', '11797', '13221',
       '14225', '19711', '23502', '23541', '29616', '35209', '41042',
       '55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object)

这样看起来干净多了。不过还是有点比较奇怪，我在google map上查了下邮编77056，居然在德克萨斯。

再来一起瞧瞧：

In [14]:

zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False
is_far = ~(is_close) & zips.notnull()

In [15]:

zips[is_far]

Out[15]:

12102    77056
13450    70711
29136    77092
30939    55164
44008    90010
47048    23541
57636    92123
71001    92123
71834    23502
80573    61702
85821    29616
89304    35209
94201    41042
Name: Incident Zip, dtype: object

In [16]:

requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort('Incident Zip')

Out[16]:

	Incident Zip	Descriptor	City
71834	23502	Harassment	NORFOLK
47048	23541	Harassment	NORFOLK
85821	29616	Debt Not Owed	GREENVILLE
89304	35209	Harassment	BIRMINGHAM
94201	41042	Harassment	FLORENCE
30939	55164	Harassment	ST. PAUL
80573	61702	Billing Dispute	BLOOMIGTON
13450	70711	Contract Dispute	CLIFTON
12102	77056	Debt Not Owed	HOUSTON
29136	77092	False Advertising	HOUSTON
44008	90010	Billing Dispute	LOS ANGELES
57636	92123	Harassment	SAN DIEGO
71001	92123	Billing Dispute	SAN DIEGO

确实有来自洛杉矶和休斯顿的请求！显然，仅仅通过过滤邮编来处理还是不够，还需要查看下city字段。

In [17]:

requests['City'].str.upper().value_counts()

Out[17]:

BROOKLYN            31662
NEW YORK            22664
BRONX               18438
STATEN ISLAND        4766
JAMAICA              2246
FLUSHING             1803
ASTORIA              1568
RIDGEWOOD            1073
CORONA                707
OZONE PARK            693
LONG ISLAND CITY      678
FAR ROCKAWAY          652
ELMHURST              647
WOODSIDE              609
EAST ELMHURST         562
...
MELVILLE                  1
PORT JEFFERSON STATION    1
NORWELL                   1
EAST ROCKAWAY             1
BIRMINGHAM                1
ROSLYN                    1
LOS ANGELES               1
MINEOLA                   1
JERSEY CITY               1
ST. PAUL                  1
CLIFTON                   1
COL.ANVURES               1
EDGEWATER                 1
ROSELYN                   1
CENTRAL ISLIP             1
Length: 100, dtype: int64

看起来这些数据是正常的，那就随它去吧～

7.5 把所有的代码放在一起¶

下面就是处理邮编字段的所有代码了：

In [53]:

na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', 
                       na_values=na_values, 
                       dtype={'Incident Zip': str})

In [54]:

def fix_zip_codes(zips):
    # Truncate everything to length 5 
    zips = zips.str.slice(0, 5)
    
    # Set 00000 zip codes to nan
    zero_zips = zips == '00000'
    zips[zero_zips] = np.nan
    
    return zips

In [55]:

requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])

In [56]:

requests['Incident Zip'].unique()

Out[56]:

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209', '10021', '10037', '11413', '11375', '11238', '10473',
       '11103', '11354', '11361', '11106', '11385', '10463', '10467',
       '11204', '11237', '11377', '11364', '11434', '11435', '11210',
       '11228', '11368', '11694', '10464', '11415', '10314', '10301',
       '10018', '10038', '11105', '11230', '10468', '11104', '10471',
       '11416', '10075', '11422', '11355', '10028', '10462', '10306',
       '10461', '11224', '11429', '10035', '11366', '11362', '11206',
       '10460', '10304', '11360', '11411', '10455', '10475', '10069',
       '10303', '10308', '10302', '11357', '10470', '11367', '11370',
       '10454', '10451', '11436', '11426', '10153', '11004', '11428',
       '11427', '11001', '11363', '10004', '10474', '11430', '10000',
       '10307', '11239', '10119', '10006', '10048', '11697', '11692',
       '11693', '10573', '00083', '11559', '10020', '77056', '11776',
       '70711', '10282', '11109', '10044', '02061', '77092', '14225',
       '55164', '19711', '07306', '90010', '11747', '23541', '11788',
       '07604', '10112', '11563', '11580', '07087', '11042', '07093',
       '11501', '92123', '11575', '07109', '11797', '10803', '11716',
       '11722', '11549', '10162', '23502', '11518', '07020', '08807',
       '11577', '07114', '11003', '07201', '61702', '10103', '29616',
       '35209', '11520', '11735', '10129', '11005', '41042', '11590',
       '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)