数据常用筛选方法
- 在数据中,选择需要的行或者列
- 基础索引方式,就是直接引用
- loc[行索引名称或者条件,列索引名称或者标签]
- iloc[行索引位置,列索引位置]
import pandas as pd
import os
import numpy as np
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据表处理'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
df
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day |
0 | 786295544 | 41098319944 | 50014866 | 50022520 | 21458:86755362;13023209:3593274;10984217:21985... | 2 | 20140919 |
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 |
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 |
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 |
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 |
5 | 152298847 | 41840167463 | 121394024 | 50008168 | 21458:3408353;13023209:727117752;22009:2741771... | 1 | 20141103 |
6 | 513441334 | 19909384116 | 50010557 | 50008168 | 25935:21991;1628665:29784;22019:34731;22019:20... | 1 | 20121212 |
7 | 297411659 | 13540124907 | 50010542 | 50008168 | 21458:60020529;25935:31381;1633959:27247291;16... | 1 | 20121212 |
8 | 82830661 | 19948600790 | 50013874 | 28 | 21458:11580;21475:137325 | 1 | 20121101 |
9 | 475046636 | 10368360710 | 203527 | 28 | 22724:40168;22729:40278;21458:21817;2770200:24... | 1 | 20121101 |
10 | 734147966 | 15307958346 | 50018202 | 38 | 21458:3270827;7361532:28710594;7397093:7536994... | 2 | 20121101 |
11 | 68547330 | 21162876126 | 50012365 | 122650008 | 1628665:3233941;1628665:3233942;1628665:323393... | 1 | 20121123 |
12 | 697081418 | 15898050723 | 50013636 | 50008168 | 21458:19726868;1633959:179425852;13836282:1290... | 1 | 20121123 |
13 | 377550424 | 15771663914 | 50015841 | 28 | 1628665:3233941;1628665:3233942;3914866:11580;... | 1 | 20121123 |
14 | 88313935 | 22532727492 | 50013711 | 50008168 | 1628665:3233941;1628665:3233942;22019:3340598;... | 1 | 20131005 |
15 | 25918750 | 16078389250 | 50012359 | 122650008 | 21458:3405407;1633959:6186201;1628366:32799;81... | 1 | 20131005 |
16 | 350288528 | 35086271572 | 50010544 | 50008168 | 21458:61813;25935:21991;1628665:3233938;162866... | 1 | 20131129 |
17 | 348090113 | 17436967558 | 50009540 | 50014815 | 21458:21910;3110425:30696849;2191928:75373546;... | 1 | 20131129 |
18 | 1635282280 | 36153356431 | 50013207 | 50008168 | 1628665:29784;1628665:29799;2904342:31004;2201... | 1 | 20131129 |
19 | 530850018 | 22058239899 | 50024147 | 28 | 21458:205007542;43307470:5543413;2339128:62147... | 1 | 20140210 |
20 | 749507708 | 19171641742 | 50018860 | 28 | 21458:3602856;1628665:3233941;1628665:3233942;... | 1 | 20140210 |
21 | 201088567 | 38564176352 | 50013207 | 50008168 | 1628665:3233941;1628665:3233942;1628665:323393... | 1 | 20140502 |
22 | 469517728 | 8232924597 | 211122 | 38 | 21458:21782;36786:42781029;13023102:6999219;22... | 6 | 20140502 |
23 | 691367866 | 17712372914 | 121434042 | 50014815 | 21458:49341152;8021059:5525523;6851452:1398669... | 1 | 20140804 |
24 | 77193822 | 35537441586 | 50006520 | 50014815 | 22277:6262384;21458:30992;1628665:3233941;1628... | 2 | 20140804 |
25 | 605678021 | 15502618744 | 50010555 | 50008168 | 25935:31381;1628665:3233941;1628665:3233942;16... | 1 | 20130226 |
26 | 47702620 | 26481508332 | 121412034 | 50014815 | 21458:49341152;11057903:4036007;130475532:7537... | 1 | 20140918 |
27 | 763560371 | 40945285800 | 50012365 | 122650008 | 21458:30992;1628665:3233939;22007:30338;22007:... | 1 | 20150201 |
28 | 408028533 | 35838498718 | 50012442 | 50008168 | 21458:3596449;6811831:3446999;13023209:3446999... | 1 | 20141009 |
29 | 53566371 | 27177784760 | 121394024 | 50008168 | 21458:42090508;1628665:3233941;1628665:3233942... | 1 | 20141009 |
... | ... | ... | ... | ... | ... | ... | ... |
29941 | 413188001 | 16521677358 | 50012478 | 50014815 | 21458:28155;5434803:3636603;2815901:22583732;1... | 1 | 20130107 |
29942 | 474062095 | 21129724585 | 50013207 | 50008168 | 21458:21599;1628665:29798;1628665:3233938;1628... | 1 | 20130107 |
29943 | 797710454 | 18176728510 | 50013177 | 28 | 1628665:3233941;1628665:3233942;1628665:323393... | 1 | 20130107 |
29944 | 1716505453 | 37844041565 | 50010555 | 50008168 | 21458:30992;25935:31381;1628665:3233941;162866... | 1 | 20141231 |
29945 | 1966692323 | 42504930457 | 50012359 | 122650008 | 21458:3379652;1628665:3233940;1628665:3233938;... | 1 | 20141231 |
29946 | 641734831 | 22105131076 | 50014277 | 50014815 | 21458:21906;13227811:51479;13230966:75369014;3... | 2 | 20141016 |
29947 | 731030177 | 41666438142 | 121394024 | 50008168 | 21458:3443560;1628665:3233942;1628665:3233938;... | 1 | 20141016 |
29948 | 68515755 | 13953276547 | 50012788 | 28 | 21458:12376977;2112993:32075;1628665:92012;162... | 1 | 20130729 |
29949 | 180436843 | 23375100402 | 50012451 | 50008168 | 21458:33514;1633959:13343071;33030:29800;33162... | 1 | 20130729 |
29950 | 801784345 | 17629938386 | 50023670 | 28 | 21458:3550980;29154281:231350353;11684888:1045... | 1 | 20130729 |
29951 | 124458824 | 19739113764 | 50013636 | 50008168 | 21458:30992;13658074:9306734;1628665:3233941;1... | 1 | 20140322 |
29952 | 602141957 | 37251457564 | 50012360 | 122650008 | 21458:21599;1628665:29798;1628665:82340;162866... | 1 | 20140322 |
29953 | 595095853 | 41160643364 | 121364022 | 50008168 | 21458:80090256;1628665:29784;1628665:29796;162... | 1 | 20150111 |
29954 | 1905258237 | 42298652641 | 121452056 | 50008168 | 21458:30992;1628665:3233942;1628665:31614;1628... | 1 | 20150111 |
29955 | 1957645413 | 36768778465 | 121448033 | 38 | 6940834:29865;1628149:137593;21475:114226;2275... | 1 | 20140815 |
29956 | 1854778218 | 37200665444 | 50012361 | 122650008 | 21458:3645338;13023209:544768204;122217803:309... | 1 | 20140815 |
29957 | 268356658 | 36932456353 | 50010236 | 50014815 | 21458:10513072;12474507:706291650;3091143:9208... | 1 | 20141027 |
29958 | 196272909 | 10066997901 | 50009540 | 50014815 | 21458:21906;13229910:32056435;2191928:73664723... | 1 | 20141104 |
29959 | 23473499 | 38019470815 | 50010236 | 50014815 | 1628665:61550;1628665:3233940;1628665:3233936;... | 1 | 20141104 |
29960 | 816394377 | 19835118833 | 50003700 | 28 | 24448:73774385;6725953:48332;22044:30715;80047... | 1 | 20130912 |
29961 | 164859586 | 15842319049 | 50012479 | 28 | NaN | 1 | 20130912 |
29962 | 119149466 | 26396292642 | 50008875 | 28 | 21458:30992;11684888:104528258;21475:11488282;... | 1 | 20130912 |
29963 | 704655047 | 10506866020 | 50007011 | 50008168 | 1628665:3233941;1628665:3233942;1628665:323393... | 1 | 20121206 |
29964 | 45662429 | 20745380642 | 50010555 | 50008168 | 25935:31381;1628665:3233941;1628665:3233942;16... | 1 | 20121206 |
29965 | 35711492 | 16563353438 | 50010544 | 50008168 | 21458:11580;25935:21991;1628665:92012;1628665:... | 1 | 20121206 |
29966 | 57747284 | 35169635909 | 50010549 | 50008168 | 21458:125202070;22019:3228688;22019:3248884;22... | 1 | 20140109 |
29967 | 287541325 | 19778523000 | 50007011 | 50008168 | 21458:112788583;1633959:3523439;3130834:209537... | 2 | 20140109 |
29968 | 82915321 | 12766532512 | 50011993 | 28 | 21475:137325;1628665:3233937;1628665:29798;162... | 1 | 20131008 |
29969 | 78259523 | 18309305134 | 50013711 | 50008168 | 21458:30992;1628665:29778;1628665:29793;163395... | 1 | 20131008 |
29970 | 758305789 | 20177445814 | 50018860 | 28 | 21458:3602856;1628665:29784;1628665:3233941;73... | 1 | 20131008 |
29971 rows × 7 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 7 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.6+ MB
df.head(10)
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day |
0 | 786295544 | 41098319944 | 50014866 | 50022520 | 21458:86755362;13023209:3593274;10984217:21985... | 2 | 20140919 |
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 |
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 |
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 |
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 |
5 | 152298847 | 41840167463 | 121394024 | 50008168 | 21458:3408353;13023209:727117752;22009:2741771... | 1 | 20141103 |
6 | 513441334 | 19909384116 | 50010557 | 50008168 | 25935:21991;1628665:29784;22019:34731;22019:20... | 1 | 20121212 |
7 | 297411659 | 13540124907 | 50010542 | 50008168 | 21458:60020529;25935:31381;1633959:27247291;16... | 1 | 20121212 |
8 | 82830661 | 19948600790 | 50013874 | 28 | 21458:11580;21475:137325 | 1 | 20121101 |
9 | 475046636 | 10368360710 | 203527 | 28 | 22724:40168;22729:40278;21458:21817;2770200:24... | 1 | 20121101 |
df.columns # 查看数据字段
Index(['user_id', 'auction_id', 'cat_id', 'cat1', 'property', 'buy_mount',
'day'],
dtype='object')
df['user_id'].head(5) # 相当于嵌套列表
0 786295544
1 532110457
2 249013725
3 917056007
4 444069173
Name: user_id, dtype: object
df[['user_id', 'cat1']].head(5)
| user_id | cat1 |
0 | 786295544 | 50022520 |
1 | 532110457 | 28 |
2 | 249013725 | 50014815 |
3 | 917056007 | 50014815 |
4 | 444069173 | 50008168 |
df[['user_id', 'cat1']][1:5] # 分片选择
| user_id | cat1 |
1 | 532110457 | 28 |
2 | 249013725 | 50014815 |
3 | 917056007 | 50014815 |
4 | 444069173 | 50008168 |
df.loc[3:4] # 按标签定位,不是按位置;标签切片两端都包含(3和4都会被选中)
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day |
3 | 917056007 | 12515996043 | 50018831 | 50014815 | 21458:15841995;21956:3494076;27000458:59723383... | 2 | 20141023 |
4 | 444069173 | 20487688075 | 50013636 | 50008168 | 21458:30992;13658074:3323064;1628665:3233941;1... | 1 | 20141103 |
# 行不做限制
# 列标签为user_id 和 buy_mount 字段
df.loc[:,['user_id','buy_mount']].head(5)
| user_id | buy_mount |
0 | 786295544 | 2 |
1 | 532110457 | 1 |
2 | 249013725 | 1 |
3 | 917056007 | 2 |
4 | 444069173 | 1 |
# 行标签为1到3,列标签为user_id 和 buy_mount 字段
df.loc[1:3, ['user_id','buy_mount']]
| user_id | buy_mount |
1 | 532110457 | 1 |
2 | 249013725 | 1 |
3 | 917056007 | 2 |
df.loc[df.user_id=='249013725', ['user_id', 'buy_mount']] # 条件筛选
| user_id | buy_mount |
2 | 249013725 | 1 |
df.loc[(df.user_id=='249013725') | (df.buy_mount>=1000), ['user_id', 'buy_mount']] # 条件取值
| user_id | buy_mount |
2 | 249013725 | 1 |
1164 | 1945590674 | 1500 |
5536 | 2288344467 | 10000 |
6627 | 117730165 | 2800 |
10402 | 32141414 | 1000 |
25675 | 173701616 | 2748 |
# loc选择的是标签,iloc选择的是位置(iloc切片不包含结束位置,例如1:3只取位置1和2)
df.iloc[1:3]
| user_id | auction_id | cat_id | cat1 | property | buy_mount | day |
1 | 532110457 | 17916191097 | 50011993 | 28 | 21458:11399317;1628862:3251296;21475:137325;16... | 1 | 20131011 |
2 | 249013725 | 21896936223 | 50012461 | 50014815 | 21458:30992;1628665:92012;1628665:3233938;1628... | 1 | 20131011 |
df.iloc[1:3, 1:4]
| auction_id | cat_id | cat1 |
1 | 17916191097 | 50011993 | 28 |
2 | 21896936223 | 50012461 | 50014815 |
df.iloc[:, [0, 2]].head(5)
| user_id | cat_id |
0 | 786295544 | 50014866 |
1 | 532110457 | 50011993 |
2 | 249013725 | 50012461 |
3 | 917056007 | 50018831 |
4 | 444069173 | 50013636 |
# 选择第二行和第十一行,第一列和第三列
df.iloc[[1,10], [0,2]]
| user_id | cat_id |
1 | 532110457 | 50011993 |
10 | 734147966 | 50018202 |