In [1]:
# The usual preamble
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'
# This is necessary to show lots of columns in pandas 0.12.
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
关于脏数据最头疼的一个问题是,我怎么知道数据是“脏”的?
接下来我们会继续使用NYC 311数据,因为它足够大并且处理起来有点棘手。
In [2]:
requests = pd.read_csv('../data/311-service-requests.csv')
/home/tianjun/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
7.1 怎么知道它是“脏”数据?¶
接下来选取其中的几列来看看,我已经知道其中的邮编有问题,所以先看看这个。
为了知道是不是有问题,我先用.unique()来列出其所有值。如果这个字段是一个数值的话,我一般会画个直方图来看看它的分布。
查看“Incident Zip”之后,显然这个字段是有许多问题的:
- 其中有些是浮点型数据,另一些却转成了字符串类型
- 其中还包含有
nan型的数据 - 某些编码包含比较长的
29616-0759或是只有很短的两位83 - 还有些空值并没有被pandas处理,而是转成了字符串类型比如'N/A' 和'NO CLUE'
我们要做的是:
- 将'N/A'和'NO CLUE' 转成通常的nan类型
- 看看83这行数据是什么意思,然后决定怎么处理
- 将所有数据转成string类型
In [3]:
requests['Incident Zip'].unique()
Out[3]:
array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0,
11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0,
11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0,
10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0,
11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0,
10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0,
10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0,
10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0,
11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0,
10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0,
10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0,
10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0,
11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0,
10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0,
11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0,
11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0,
11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0,
11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0,
10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0,
11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0,
10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0,
10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0,
10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0,
10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0,
11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0,
10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0,
11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0,
10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310',
'11105', '10462', '10029', '10301', '10457', '10467', '10469',
'11225', '10035', '10031', '11226', '10454', '11221', '10025',
'11229', '11235', '11422', '10472', '11208', '11102', '10032',
'11216', '10473', '10463', '11213', '10040', '10302', '11231',
'10470', '11204', '11104', '11212', '10466', '11416', '11214',
'10009', '11692', '11385', '11423', '11201', '10024', '11435',
'10312', '10030', '11106', '10033', '10303', '11215', '11222',
'11354', '10016', '10034', '11420', '10304', '10019', '11237',
'11249', '11230', '11372', '11207', '11378', '11419', '11361',
'10011', '11357', '10012', '11358', '10003', '10002', '11374',
'10007', '11234', '10065', '11369', '11434', '11205', '11206',
'11415', '11236', '11218', '11413', '10458', '11101', '10306',
'11355', '10023', '11368', '10314', '11421', '10010', '10018',
'11223', '10455', '11377', '11433', '11375', '10037', '11209',
'10459', '10128', '10014', '10282', '11373', '10451', '11238',
'11211', '10038', '11694', '11203', '11691', '11232', '10305',
'10021', '11228', '10036', '10001', '10017', '11217', '11219',
'10308', '10465', '11379', '11414', '10460', '11417', '11220',
'11366', '10027', '11370', '10309', '11412', '11356', '10456',
'11432', '10022', '10013', '11367', '11040', '10026', '10475',
'11210', '11364', '11426', '10471', '10119', '11224', '11418',
'11429', '11365', '10461', '11239', '10039', '00083', '11411',
'10075', '11004', '11360', '10453', '10028', '11430', '10307',
'11103', '10004', '10069', '10005', '10474', '11428', '11436',
'10020', '11001', '11362', '11693', '10464', '11427', '10044',
'11363', '10006', '10000', '02061', '77092-2016', '10280', '11109',
'14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE',
'90010', '10281', '11747', '23541', '11776', '11697', '11788',
'07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0,
7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803',
'11716', '11722', '11549-3650', '10162', '92123', '23502', '11518',
'07020', '08807', '11577', '07114', '11003', '07201', '11563',
'61702', '10103', '29616-0759', '35209-3114', '11520', '11735',
'10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0,
13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object)
7.3 处理nan 以及string/float 类型转换¶
可以直接将传递参数na_values到read_csv函数来处理这些异常值。此外指定‘Incident Zip’字段为str类型。
In [4]:
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})
In [5]:
requests['Incident Zip'].unique()
Out[5]:
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
'11417', '10011', '11225', '11218', '10003', '10029', '10466',
'11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
'10305', '10312', '10026', '10309', '10036', '11433', '11235',
'11213', '11379', '11101', '10014', '11231', '11234', '10457',
'10459', '10465', '11207', '10002', '10034', '11233', '10453',
'10456', '10469', '11374', '11221', '11421', '11215', '10007',
'10019', '11205', '11418', '11369', '11249', '10005', '10009',
'11211', '11412', '10458', '11229', '10065', '10030', '11222',
'10024', '10013', '11420', '11365', '10012', '11214', '11212',
'10022', '11232', '11040', '11226', '10281', '11102', '11208',
'10001', '10472', '11414', '11223', '10040', '11220', '11373',
'11203', '11691', '11356', '10017', '10452', '10280', '11217',
'10031', '11201', '11358', '10128', '11423', '10039', '10010',
'11209', '10021', '10037', '11413', '11375', '11238', '10473',
'11103', '11354', '11361', '11106', '11385', '10463', '10467',
'11204', '11237', '11377', '11364', '11434', '11435', '11210',
'11228', '11368', '11694', '10464', '11415', '10314', '10301',
'10018', '10038', '11105', '11230', '10468', '11104', '10471',
'11416', '10075', '11422', '11355', '10028', '10462', '10306',
'10461', '11224', '11429', '10035', '11366', '11362', '11206',
'10460', '10304', '11360', '11411', '10455', '10475', '10069',
'10303', '10308', '10302', '11357', '10470', '11367', '11370',
'10454', '10451', '11436', '11426', '10153', '11004', '11428',
'11427', '11001', '11363', '10004', '10474', '11430', '10000',
'10307', '11239', '10119', '10006', '10048', '11697', '11692',
'11693', '10573', '00083', '11559', '10020', '77056', '11776',
'70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',
'55164-0737', '19711', '07306', '000000', '90010', '11747', '23541',
'11788', '07604', '10112', '11563', '11580', '07087', '11042',
'07093', '11501', '92123', '00000', '11575', '07109', '11797',
'10803', '11716', '11722', '11549-3650', '10162', '23502', '11518',
'07020', '08807', '11577', '07114', '11003', '07201', '61702',
'10103', '29616-0759', '35209-3114', '11520', '11735', '10129',
'11005', '41042', '11590', '06901', '07208', '11530', '13221',
'10954', '11111', '10107'], dtype=object)
7.4 有横线的数据是个什么情况?¶
In [6]:
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
len(requests[rows_with_dashes])
Out[6]:
5
In [7]:
requests[rows_with_dashes]
Out[7]:
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29136 | 26550551 | 10/24/2013 06:16:34 PM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | False Advertising | NaN | 77092-2016 | 2700 EAST SELTICE WAY | EAST SELTICE WAY | NaN | NaN | NaN | NaN | NaN | HOUSTON | NaN | NaN | Assigned | 11/13/2013 11:15:20 AM | 10/29/2013 11:16:16 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 30939 | 26548831 | 10/24/2013 09:35:10 AM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 55164-0737 | P.O. BOX 64437 | 64437 | NaN | NaN | NaN | NaN | NaN | ST. PAUL | NaN | NaN | Assigned | 11/13/2013 02:30:21 PM | 10/29/2013 02:31:06 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 70539 | 26488417 | 10/15/2013 03:40:33 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 11549-3650 | 365 HOFSTRA UNIVERSITY | HOFSTRA UNIVERSITY | NaN | NaN | NaN | NaN | NaN | HEMSTEAD | NaN | NaN | Assigned | 11/30/2013 01:20:33 PM | 10/16/2013 01:21:39 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | La Guardia Airport | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 85821 | 26468296 | 10/10/2013 12:36:43 PM | 10/26/2013 01:07:07 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Debt Not Owed | NaN | 29616-0759 | PO BOX 25759 | BOX 25759 | NaN | NaN | NaN | NaN | NaN | GREENVILLE | NaN | NaN | Closed | 10/26/2013 09:20:28 AM | 10/26/2013 01:07:07 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 89304 | 26461137 | 10/09/2013 05:23:46 PM | 10/25/2013 01:06:41 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 35209-3114 | 600 BEACON PKWY | BEACON PKWY | NaN | NaN | NaN | NaN | NaN | BIRMINGHAM | NaN | NaN | Closed | 10/25/2013 02:43:42 PM | 10/25/2013 01:06:41 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
我一开始以为是数据确实,然后直接将它用nan代替了:
requests['Incident Zip'][rows_with_dashes] = np.nan
不过我朋友Dave告诉我那些9位数的邮编是正常的。现在先找出这些邮编,然后去掉后面的部分只保留前5位。
In [9]:
long_zip_codes = requests['Incident Zip'].str.len() > 5
requests['Incident Zip'][long_zip_codes].unique()
Out[9]:
array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',
'35209-3114'], dtype=object)
Those all look okay to truncate to me.
In [10]:
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)
搞定。
之前我以为00083是个错误的邮编,不过后来发现中央公园的邮编居然是00083!涨姿势了。先在最关心的还是00000 这样的编码。先看下这些数据吧:
In [11]:
requests[requests['Incident Zip'] == '00000']
Out[11]:
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 42600 | 26529313 | 10/22/2013 02:51:06 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | NaN | 00000 | EWR EWR | EWR | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/07/2013 09:53:51 AM | 10/23/2013 09:54:43 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60843 | 26507389 | 10/17/2013 05:48:44 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 00000 | 1 NEWARK AIRPORT | NEWARK AIRPORT | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/02/2013 11:59:46 AM | 10/18/2013 12:01:08 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
也看不出什么名堂,就全都设为nan吧。
In [12]:
zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan
现在看下处理后的结果
In [13]:
unique_zips = requests['Incident Zip'].unique()
unique_zips.sort()
unique_zips
Out[13]:
array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109',
'07114', '07201', '07208', '07306', '07604', '08807', '10000',
'10001', '10002', '10003', '10004', '10005', '10006', '10007',
'10009', '10010', '10011', '10012', '10013', '10014', '10016',
'10017', '10018', '10019', '10020', '10021', '10022', '10023',
'10024', '10025', '10026', '10027', '10028', '10029', '10030',
'10031', '10032', '10033', '10034', '10035', '10036', '10037',
'10038', '10039', '10040', '10044', '10048', '10065', '10069',
'10075', '10103', '10107', '10112', '10119', '10128', '10129',
'10153', '10162', '10280', '10281', '10282', '10301', '10302',
'10303', '10304', '10305', '10306', '10307', '10308', '10309',
'10310', '10312', '10314', '10451', '10452', '10453', '10454',
'10455', '10456', '10457', '10458', '10459', '10460', '10461',
'10462', '10463', '10464', '10465', '10466', '10467', '10468',
'10469', '10470', '10471', '10472', '10473', '10474', '10475',
'10573', '10803', '10954', '11001', '11003', '11004', '11005',
'11040', '11042', '11101', '11102', '11103', '11104', '11105',
'11106', '11109', '11111', '11201', '11203', '11204', '11205',
'11206', '11207', '11208', '11209', '11210', '11211', '11212',
'11213', '11214', '11215', '11216', '11217', '11218', '11219',
'11220', '11221', '11222', '11223', '11224', '11225', '11226',
'11228', '11229', '11230', '11231', '11232', '11233', '11234',
'11235', '11236', '11237', '11238', '11239', '11249', '11354',
'11355', '11356', '11357', '11358', '11360', '11361', '11362',
'11363', '11364', '11365', '11366', '11367', '11368', '11369',
'11370', '11372', '11373', '11374', '11375', '11377', '11378',
'11379', '11385', '11411', '11412', '11413', '11414', '11415',
'11416', '11417', '11418', '11419', '11420', '11421', '11422',
'11423', '11426', '11427', '11428', '11429', '11430', '11432',
'11433', '11434', '11435', '11436', '11501', '11518', '11520',
'11530', '11549', '11559', '11563', '11575', '11577', '11580',
'11590', '11691', '11692', '11693', '11694', '11697', '11716',
'11722', '11735', '11747', '11776', '11788', '11797', '13221',
'14225', '19711', '23502', '23541', '29616', '35209', '41042',
'55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object)
这样看起来干净多了。不过还是有点比较奇怪,我在google map上查了下邮编77056,居然在德克萨斯。
再来一起瞧瞧:
In [14]:
zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False
is_far = ~(is_close) & zips.notnull()
In [15]:
zips[is_far]
Out[15]:
12102 77056 13450 70711 29136 77092 30939 55164 44008 90010 47048 23541 57636 92123 71001 92123 71834 23502 80573 61702 85821 29616 89304 35209 94201 41042 Name: Incident Zip, dtype: object
In [16]:
requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort('Incident Zip')
Out[16]:
| Incident Zip | Descriptor | City | |
|---|---|---|---|
| 71834 | 23502 | Harassment | NORFOLK |
| 47048 | 23541 | Harassment | NORFOLK |
| 85821 | 29616 | Debt Not Owed | GREENVILLE |
| 89304 | 35209 | Harassment | BIRMINGHAM |
| 94201 | 41042 | Harassment | FLORENCE |
| 30939 | 55164 | Harassment | ST. PAUL |
| 80573 | 61702 | Billing Dispute | BLOOMIGTON |
| 13450 | 70711 | Contract Dispute | CLIFTON |
| 12102 | 77056 | Debt Not Owed | HOUSTON |
| 29136 | 77092 | False Advertising | HOUSTON |
| 44008 | 90010 | Billing Dispute | LOS ANGELES |
| 57636 | 92123 | Harassment | SAN DIEGO |
| 71001 | 92123 | Billing Dispute | SAN DIEGO |
确实有来自洛杉矶和休斯顿的请求!显然,仅仅通过过滤邮编来处理还是不够,还需要查看下city字段。
In [17]:
requests['City'].str.upper().value_counts()
Out[17]:
BROOKLYN 31662 NEW YORK 22664 BRONX 18438 STATEN ISLAND 4766 JAMAICA 2246 FLUSHING 1803 ASTORIA 1568 RIDGEWOOD 1073 CORONA 707 OZONE PARK 693 LONG ISLAND CITY 678 FAR ROCKAWAY 652 ELMHURST 647 WOODSIDE 609 EAST ELMHURST 562 ... MELVILLE 1 PORT JEFFERSON STATION 1 NORWELL 1 EAST ROCKAWAY 1 BIRMINGHAM 1 ROSLYN 1 LOS ANGELES 1 MINEOLA 1 JERSEY CITY 1 ST. PAUL 1 CLIFTON 1 COL.ANVURES 1 EDGEWATER 1 ROSELYN 1 CENTRAL ISLIP 1 Length: 100, dtype: int64
看起来这些数据是正常的,那就随它去吧~
7.5 把所有的代码放在一起¶
下面就是处理邮编字段的所有代码了:
In [53]:
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv',
na_values=na_values,
dtype={'Incident Zip': str})
In [54]:
def fix_zip_codes(zips):
# Truncate everything to length 5
zips = zips.str.slice(0, 5)
# Set 00000 zip codes to nan
zero_zips = zips == '00000'
zips[zero_zips] = np.nan
return zips
In [55]:
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
In [56]:
requests['Incident Zip'].unique()
Out[56]:
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
'11417', '10011', '11225', '11218', '10003', '10029', '10466',
'11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
'10305', '10312', '10026', '10309', '10036', '11433', '11235',
'11213', '11379', '11101', '10014', '11231', '11234', '10457',
'10459', '10465', '11207', '10002', '10034', '11233', '10453',
'10456', '10469', '11374', '11221', '11421', '11215', '10007',
'10019', '11205', '11418', '11369', '11249', '10005', '10009',
'11211', '11412', '10458', '11229', '10065', '10030', '11222',
'10024', '10013', '11420', '11365', '10012', '11214', '11212',
'10022', '11232', '11040', '11226', '10281', '11102', '11208',
'10001', '10472', '11414', '11223', '10040', '11220', '11373',
'11203', '11691', '11356', '10017', '10452', '10280', '11217',
'10031', '11201', '11358', '10128', '11423', '10039', '10010',
'11209', '10021', '10037', '11413', '11375', '11238', '10473',
'11103', '11354', '11361', '11106', '11385', '10463', '10467',
'11204', '11237', '11377', '11364', '11434', '11435', '11210',
'11228', '11368', '11694', '10464', '11415', '10314', '10301',
'10018', '10038', '11105', '11230', '10468', '11104', '10471',
'11416', '10075', '11422', '11355', '10028', '10462', '10306',
'10461', '11224', '11429', '10035', '11366', '11362', '11206',
'10460', '10304', '11360', '11411', '10455', '10475', '10069',
'10303', '10308', '10302', '11357', '10470', '11367', '11370',
'10454', '10451', '11436', '11426', '10153', '11004', '11428',
'11427', '11001', '11363', '10004', '10474', '11430', '10000',
'10307', '11239', '10119', '10006', '10048', '11697', '11692',
'11693', '10573', '00083', '11559', '10020', '77056', '11776',
'70711', '10282', '11109', '10044', '02061', '77092', '14225',
'55164', '19711', '07306', '90010', '11747', '23541', '11788',
'07604', '10112', '11563', '11580', '07087', '11042', '07093',
'11501', '92123', '11575', '07109', '11797', '10803', '11716',
'11722', '11549', '10162', '23502', '11518', '07020', '08807',
'11577', '07114', '11003', '07201', '61702', '10103', '29616',
'35209', '11520', '11735', '10129', '11005', '41042', '11590',
'06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)