# 1. 911 call classification example -- code sample (911分类案例_代码示例)
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Load the 911 call log and derive the list of call categories.
# Names defined for the rest of this script: my_font, df, time_data,
# tempt_list, cate_list.

# Font handle for a Windows-only font file; it is not referenced again in
# this file (presumably meant for `fontproperties=my_font` on Chinese labels).
my_font = font_manager.FontProperties(fname="c:/Windows/Fonts/simkai.ttf")

df = pd.read_csv("./911.csv")
# df.info() recorded from an earlier run:
#   RangeIndex: 249737 entries, 0 to 249736 -- 9 columns
#   0 lat        249737 non-null float64
#   1 lng        249737 non-null float64
#   2 desc       249737 non-null object
#   3 zip        219391 non-null float64
#   4 title      249737 non-null object
#   5 timeStamp  249737 non-null object
#   6 twp        249644 non-null object
#   7 addr       249737 non-null object
#   8 e          249737 non-null int64
#   dtypes: float64(3), int64(1), object(5); memory usage: 17.1+ MB

# One-column frame holding only the raw timestamp strings, e.g.
#   0       2015-12-10 17:10:52
#   ...
#   249736  2017-09-20 19:42:29      [249737 rows x 1 columns]
time_data = df["timeStamp"].to_frame()

# Get the categories: split every title at ":"; the first piece is the
# category. Recorded sample of the split result:
#   ['Traffic', ' DISABLED VEHICLE -'], ['Fire', ' FIRE ALARM'],
#   ['EMS', ' UNKNOWN MEDICAL EMERGENCY'], ...
tempt_list = df["title"].str.split(":").tolist()

# A set removes the duplicates; sorting it gives the same order on every run
# (plain set iteration order for strings can differ between interpreter
# runs, which would reorder the columns and bars built from this list).
# With the three categories recorded for this data set the result is
#   ['EMS', 'Fire', 'Traffic']
cate_list = sorted({parts[0] for parts in tempt_list})
# Build a one-hot table (one row per call, one column per category) and
# total each column to get the number of calls per category.
# Reads `df` and `cate_list`; defines `zero_array` and `sum_data`.

# Start from an all-zero frame shaped (number of calls, number of categories).
zero_array = pd.DataFrame(
    np.zeros((df.shape[0], len(cate_list))), columns=cate_list
)

# The category is the text before the first ":" of the title. Comparing that
# prefix (instead of searching the whole title for the category word) keeps a
# call from being counted under a second category whose name merely appears
# later in its description: the totals recorded from the substring-based
# version (EMS 124844, Traffic 87465, Fire 37432) add up to 249741, four
# more than the 249737 rows.
title_category = df["title"].str.split(":").str[0]

for cate in cate_list:
    # Vectorised assignment through a single .loc call. The chained form
    # `zero_array[cate][mask] = 1` may write to a temporary copy and is a
    # no-op when pandas copy-on-write is active.
    zero_array.loc[(title_category == cate).to_numpy(), cate] = 1
# Recorded shape of the filled table (values are 0.0 / 1.0 floats):
#            Fire  Traffic  EMS
#   0         0.0      0.0  1.0
#   ...
#   249736    0.0      1.0  0.0      [249737 rows x 3 columns]

# Column sums = calls per category (float64 Series indexed by category).
sum_data = zero_array.sum(axis=0)
# Draw the per-category call totals held in `sum_data` as a bar chart.
plt.figure(figsize=(20, 8), dpi=80)
category_names = sum_data.keys()
call_totals = sum_data.values
bar_positions = range(len(category_names))
plt.bar(bar_positions, call_totals, width=0.1)
# Write each total just above its bar. The text is anchored at the numeric
# bar position (0, 1, 2, ...) -- the same coordinates the bars were drawn
# at -- not at the category label.
for position, total in enumerate(call_totals):
    plt.text(position, total + 0.5, total.astype(int), ha='center', va='bottom')
plt.xticks(bar_positions, category_names)
plt.show()
# 2. pandas time series -- code sample (pandas的时间序列_代码示例)
import pandas as pd
import numpy as np
# pandas time-series basics: build date ranges and use one as a frame index.
# Defines `date_list`, `index` and `df`.

# Month-end dates between the two bounds. The frequency is given as an
# offset object rather than the string alias 'M', which pandas 2.2
# deprecated in favour of 'ME'; the object form works on old and new
# releases alike. Result:
#   DatetimeIndex(['2021-01-31', '2021-02-28', ..., '2021-12-31'])  (12 entries)
date_list = pd.date_range(
    start="2021-1-1", end="2022-1-1", freq=pd.offsets.MonthEnd()
)

# `periods` can replace `end`. An hourly range of four stamps, e.g.
#   pd.date_range(start="20210101", periods=4, freq=pd.offsets.Hour())
# gives 2021-01-01 00:00:00 through 2021-01-01 03:00:00.

# Default frequency is daily: five consecutive days starting 2022-01-01.
index = pd.date_range(start="20220101", periods=5)

# One column (named 0) of uniform random numbers indexed by those days, e.g.
#   2022-01-01  0.269013
#   ...
#   2022-01-05  0.513025
df = pd.DataFrame(np.random.rand(5), index=index)
# 3. 911 calls over time -- code sample (911的时间统计_代码示例)
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# Count the 911 calls per calendar month.
# Defines `df` (indexed by call time) and `count_by_moth`.

df = pd.read_csv("./911.csv")
# df.info() recorded from an earlier run:
#   RangeIndex: 249737 entries, 0 to 249736 -- 9 columns
#   lat, lng (float64) | desc (object) | zip (float64, 219391 non-null)
#   title, timeStamp (object) | twp (object, 249644 non-null)
#   addr (object) | e (int64)
#   dtypes: float64(3), int64(1), object(5); memory usage: 17.1+ MB

# The timestamps are read as plain strings (dtype object), e.g.
# "2015-12-10 17:10:52"; convert them to datetime64[ns] so they can be
# resampled.
df["timeStamp"] = pd.to_datetime(df["timeStamp"])

# Make the call time the index (in place); 8 data columns remain:
#   timeStamp              lat         lng        ...  addr                    e
#   2015-12-10 17:10:52    40.297876   -75.581294 ...  REINDEER CT & DEAD END  1
#   ...                                                     [249737 rows x 8 columns]
df.set_index("timeStamp", inplace=True)

# Number of calls in each month, labelled by the month's last day. Only the
# "title" column is counted, since that is the one column used. The
# frequency is an offset object because the string alias "M" is deprecated
# since pandas 2.2 in favour of "ME". Recorded result:
#   2015-12-31     7916
#   2016-01-31    13096
#   ...
#   2017-09-30     7276
#   Name: title, dtype: int64
count_by_moth = df["title"].resample(pd.offsets.MonthEnd()).count()
# Line chart of the monthly call counts held in `count_by_moth`.
plt.figure(figsize=(20, 8), dpi=60)
# Tick labels keep only year, month and day of each month-end stamp.
month_labels = [stamp.strftime('%Y%m%d') for stamp in count_by_moth.index]
tick_positions = range(len(month_labels))
plt.plot(tick_positions, count_by_moth.values)
plt.xticks(tick_positions, month_labels, rotation=45)
plt.show()
# 4. 911 combined statistics -- code sample (911的复合统计_代码示例)
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# Goal of this script: show how the number of 911 calls of each type
# changes from month to month.
# This part loads the data, adds a "cate" column and indexes by call time.
df = pd.read_csv("./911.csv")

# Convert the timestamp strings to a real datetime type, e.g.
#   0        2015-12-10 17:10:52
#   ...
#   249736   2017-09-20 19:42:29
#   Name: timeStamp, Length: 249737, dtype: datetime64[ns]
df["timeStamp"] = pd.to_datetime(df['timeStamp'])

# The category of a call is the part of its title before the first ":".
# Recorded sample: ['EMS', 'EMS', ..., 'Fire', 'Traffic']
temp_list = df["title"].str.split(":").tolist()
cate_list = [parts[0] for parts in temp_list]

# Add the category column. A plain list is assigned by position, so no
# intermediate array/DataFrame and no index alignment are involved.
df['cate'] = cate_list

# Make the call time the index (changes df in place). Recorded result:
#   timeStamp
#   2015-12-10 17:10:52    EMS
#   2015-12-10 17:29:21    EMS
#   ...
#   2017-09-20 19:42:29    Traffic
#   Name: cate, Length: 249737, dtype: object
df.set_index("timeStamp", inplace=True)

# Iterating df.groupby(by="cate") yields (category, sub-frame) pairs, which
# is why pd.DataFrame(df.groupby(by="cate")) printed two columns:
#      0        1
#   0  EMS      lat lng ...
#   1  Fire     lat lng ...
#   2  Traffic  lat lng ...
# Plot one line per call category showing its number of calls per month.
# Reads `df`, which must be indexed by call time and carry a "cate" column.
plt.figure(figsize=(20, 8), dpi=60)

# Month-end frequency as an offset object: the string alias "M" is
# deprecated since pandas 2.2 in favour of "ME"; the object works on both.
month_end = pd.offsets.MonthEnd()

# groupby yields (category name, rows of that category) pairs.
for group_name, group_data in df.groupby(by="cate"):
    # Monthly call count for this category; only "title" is counted since
    # it is the one column used.
    count_by_moth = group_data["title"].resample(month_end).count()
    _x = [i.strftime('%Y%m%d') for i in count_by_moth.index]
    _y = count_by_moth.values
    plt.plot(range(len(_x)), _y, label=group_name)

# Tick labels come from the last category plotted.
# NOTE(review): this assumes every category covers the same run of months --
# confirm against the data.
plt.xticks(range(len(_x)), _x, rotation=45)
plt.legend(loc="best")
plt.show()
# 5. PM2.5 statistics -- code sample (pm2.5的统计_代码示例)
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# Compare two Beijing PM2.5 readings ("PM_US Post" and "PM_Dongsihuan") as
# 7-day averages on a single line chart.

df = pd.read_csv("./PM2.5/BeijingPM20100101_20151231.csv")
# First row, recorded from an earlier run:
#   No  year  month  day  hour ...  TEMP  cbwd  Iws   precipitation  Iprec
#   1   2010  1      1    0    ... -11.0  NW    1.79  0.0            0.0
# df.info() recorded from an earlier run (52584 rows, 18 columns):
#   int64  : No, year, month, day, hour, season            (all non-null)
#   float64: PM_Dongsi 25052 | PM_Dongsihuan 20508 | PM_Nongzhanguan 24931
#            PM_US Post 50387 | DEWP 52579 | HUMI 52245 | PRES 52245
#            TEMP 52579 | Iws 52579 | precipitation 52100 | Iprec 52100
#   object : cbwd 52579
#   memory usage: 7.2+ MB

# Combine the separate year/month/day/hour columns into one timestamp per
# row and use it as the index. pd.to_datetime assembles the date from those
# column names directly. This replaces pd.PeriodIndex(year=..., month=...,
# day=..., hour=..., freq="H"): that keyword form and resampling on a
# PeriodIndex are both deprecated since pandas 2.2.
df["date_time"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
df.set_index("date_time", inplace=True)

# Down-sample the hourly rows to 7-day means. Only numeric columns are
# averaged: the text column "cbwd" has no mean, and newer pandas raises
# instead of silently dropping it. Recorded head of the result:
#   date_time    No     year    month     ...  Iws        precipitation  Iprec
#   2010-01-01   84.5   2010.0  1.000000  ...  43.859821  0.066667       0.786905
#   2010-01-08   252.5  2010.0  1.000000  ...  45.392083  0.000000       0.000000
#   ...                                                    [5 rows x 17 columns]
df = df.select_dtypes(include="number").resample("7D").mean()

# Missing readings are left as NaN here; `df["PM_US Post"].dropna()` would
# delete them instead.
data = df["PM_US Post"]
data_china = df["PM_Dongsihuan"]

plt.figure(figsize=(20, 8), dpi=60)
# Date-only tick labels, one per 7-day bin.
_x = [stamp.strftime("%Y-%m-%d") for stamp in data.index]
_y = data.values
_y_china = data_china.values
plt.plot(range(len(_x)), _y, label="us")
plt.plot(range(len(_x)), _y_china, label="cn")
# Both range() and list slicing accept a step: label every 10th bin only.
plt.xticks(range(0, len(_x), 10), _x[::10], rotation=45)
plt.legend(loc="best")
plt.show()