0
点赞
收藏
分享

微信扫一扫

pandas5:DataFrame的连接concat,合并merge与删除drop

上善若水的道 2022-09-14 阅读 70


1.连接,concat

1.1行连接

import pandas as pd

df1 = pd.read_csv('./data/concat_1.csv')
print(df1)
df2 = pd.read_csv('./data/concat_2.csv')
print(df2)
df3 = pd.read_csv('./data/concat_3.csv')
print(df3)

    A   B   C   D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
A B C D
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11

row_concat = pd.concat([df1, df2, df3])
print(row_concat)

     A    B    C    D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11

print(row_concat.iloc[4,])
print(row_concat.iloc[4:6,])

A    a4
B b4
C c4
D d4
Name: 0, dtype: object
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5

new_row_series = pd.Series(['n1','n2','n3','n4'])
print(new_row_series)

0    n1
1 n2
2 n3
3 n4
dtype: object

print(pd.concat([df1, new_row_series]))

     A    B    C    D    0
0 a0 b0 c0 d0 NaN
1 a1 b1 c1 d1 NaN
2 a2 b2 c2 d2 NaN
3 a3 b3 c3 d3 NaN
0 NaN NaN NaN NaN n1
1 NaN NaN NaN NaN n2
2 NaN NaN NaN NaN n3
3 NaN NaN NaN NaN n4

new_row_df = pd.DataFrame([['n1','n2','n3','n4']],
columns=['A','B','C','D'])
print(new_row_df)

    A   B   C   D
0 n1 n2 n3 n4

print(pd.concat([df1,new_row_df]))
print(pd.concat([df1,new_row_df],ignore_index=True))

    A   B   C   D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
0 n1 n2 n3 n4
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
4 n1 n2 n3 n4

1.2列拼接

df1 = pd.read_csv('./data/concat_1.csv')
print(df1)
df2 = pd.read_csv('./data/concat_2.csv')
print(df2)
df3 = pd.read_csv('./data/concat_3.csv')
print(df3)

    A   B   C   D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
A B C D
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11

col_concat = pd.concat([df1, df2, df3],axis=1)
print(col_concat)

    A   B   C   D   A   B   C   D    A    B    C    D
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11

print(col_concat['B'])

    B   B    B
0 b0 b4 b8
1 b1 b5 b9
2 b2 b6 b10
3 b3 b7 b11

# ADD NEW COL
col_concat['new_col_list'] = ['n1','n2','n3','n4']
print(col_concat)

    A   B   C   D   A   B   C   D    A    B    C    D new_col_list
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8 n1
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9 n2
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10 n3
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11 n4

# 用Series添加一列
col_concat['new_col_series'] = pd.Series(['n1','n2','n3','n4'])
print(col_concat)

    A   B   C   D   A   B   C   D    A    B    C    D new_col_list  \
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8 n1
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9 n2
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10 n3
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11 n4

new_col_series
0 n1
1 n2
2 n3
3 n4

col_concat = pd.concat([df1,df2,df3],axis=1,ignore_index=True)
print(col_concat)

   0   1   2   3   4   5   6   7    8    9    10   11
0 a0 b0 c0 d0 a4 b4 c4 d4 a8 b8 c8 d8
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10
3 a3 b3 c3 d3 a7 b7 c7 d7 a11 b11 c11 d11

print(col_concat[2])

0    c0
1 c1
2 c2
3 c3
Name: 2, dtype: object

print(col_concat[2][1])

c1

print(col_concat[1:3])

   0   1   2   3   4   5   6   7    8    9    10   11
1 a1 b1 c1 d1 a5 b5 c5 d5 a9 b9 c9 d9
2 a2 b2 c2 d2 a6 b6 c6 d6 a10 b10 c10 d10

# 拥有不同列的data_frame
df1 = pd.read_csv('./data/concat_1.csv')
df2 = pd.read_csv('./data/concat_2.csv')
df3 = pd.read_csv('./data/concat_3.csv')

df1.columns = ['A','B','C','D']
df2.columns = ['E','F','G','H']
df3.columns = ['A','C','F','H']

print(df1)
print(df2)
print(df3)

    A   B   C   D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
E F G H
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7
A C F H
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11

col_concat = pd.concat([df1,df2,df3])
print(col_concat)

     A    B    C    D    E    F    G    H
0 a0 b0 c0 d0 NaN NaN NaN NaN
1 a1 b1 c1 d1 NaN NaN NaN NaN
2 a2 b2 c2 d2 NaN NaN NaN NaN
3 a3 b3 c3 d3 NaN NaN NaN NaN
0 NaN NaN NaN NaN a4 b4 c4 d4
1 NaN NaN NaN NaN a5 b5 c5 d5
2 NaN NaN NaN NaN a6 b6 c6 d6
3 NaN NaN NaN NaN a7 b7 c7 d7
0 a8 NaN b8 NaN NaN c8 NaN d8
1 a9 NaN b9 NaN NaN c9 NaN d9
2 a10 NaN b10 NaN NaN c10 NaN d10
3 a11 NaN b11 NaN NaN c11 NaN d11


/home/leon/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

"""Entry point for launching an IPython kernel.

2.合并,merge

类似于关系型数据库中两个表之间的连接(join)

# table1.field1 table2.field2
# select table1.field1,table2.field2
# from table1,table2
# where table1.field1 = table2.field2

# 1. 一对一 (one-to-one)
# 2. 多对一 (many-to-one)
# 3. 多对多 (many-to-many)

import pandas as pd

person = pd.read_csv('./data/survey_person.csv')
site = pd.read_csv('./data/survey_site.csv')
survey = pd.read_csv('./data/survey_survey.csv')
visited = pd.read_csv('./data/survey_visited.csv')

print(person)
print(site)
print(survey)
print(visited)

      ident   personal    family
0 dyer William Dyer
1 pb Frank Pabodie
2 lake Anderson Lake
3 roe Valentina Roerich
4 danforth Frank Danforth
name lat long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
taken person quant reading
0 619 dyer rad 9.82
1 619 dyer sal 0.13
2 622 dyer rad 7.80
3 622 dyer sal 0.09
4 734 pb rad 8.41
5 734 lake sal 0.05
6 734 pb temp -21.50
7 735 pb rad 7.22
8 735 NaN sal 0.06
9 735 NaN temp -26.00
10 751 pb rad 4.35
11 751 pb temp -18.50
12 751 lake sal 0.10
13 752 lake rad 2.19
14 752 lake sal 0.09
15 752 lake temp -16.00
16 752 roe sal 41.60
17 837 lake rad 1.46
18 837 lake sal 0.21
19 837 roe sal 22.50
20 844 roe rad 11.25
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

visited_subset = visited.loc[[0,2],]
print(visited_subset)

   ident  site       dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07

# one to one
print(site)
print(visited_subset)

    name    lat    long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
ident site dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07

merge1 = site.merge(visited_subset,left_on = 'name',right_on='site')
print(merge1)

   name    lat    long  ident  site       dated
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08
1 DR-3 -47.15 -126.72 734 DR-3 1939-01-07

merge多对一

print(site)
print(visited)

    name    lat    long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

merge2 = site.merge(visited,left_on = 'name',right_on='site')
print(merge2)

    name    lat    long  ident   site       dated
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08
1 DR-1 -49.85 -128.57 622 DR-1 1927-02-10
2 DR-1 -49.85 -128.57 844 DR-1 1932-03-22
3 DR-3 -47.15 -126.72 734 DR-3 1939-01-07
4 DR-3 -47.15 -126.72 735 DR-3 1930-01-12
5 DR-3 -47.15 -126.72 751 DR-3 1930-02-26
6 DR-3 -47.15 -126.72 752 DR-3 NaN
7 MSK-4 -48.87 -123.40 837 MSK-4 1932-01-14

merge多对多

print(person)
print(survey)

      ident   personal    family
0 dyer William Dyer
1 pb Frank Pabodie
2 lake Anderson Lake
3 roe Valentina Roerich
4 danforth Frank Danforth
taken person quant reading
0 619 dyer rad 9.82
1 619 dyer sal 0.13
2 622 dyer rad 7.80
3 622 dyer sal 0.09
4 734 pb rad 8.41
5 734 lake sal 0.05
6 734 pb temp -21.50
7 735 pb rad 7.22
8 735 NaN sal 0.06
9 735 NaN temp -26.00
10 751 pb rad 4.35
11 751 pb temp -18.50
12 751 lake sal 0.10
13 752 lake rad 2.19
14 752 lake sal 0.09
15 752 lake temp -16.00
16 752 roe sal 41.60
17 837 lake rad 1.46
18 837 lake sal 0.21
19 837 roe sal 22.50
20 844 roe rad 11.25

merge3 = person.merge(survey,left_on = 'ident',right_on='person')
print(merge3)

   ident   personal   family  taken person quant  reading
0 dyer William Dyer 619 dyer rad 9.82
1 dyer William Dyer 619 dyer sal 0.13
2 dyer William Dyer 622 dyer rad 7.80
3 dyer William Dyer 622 dyer sal 0.09
4 pb Frank Pabodie 734 pb rad 8.41
5 pb Frank Pabodie 734 pb temp -21.50
6 pb Frank Pabodie 735 pb rad 7.22
7 pb Frank Pabodie 751 pb rad 4.35
8 pb Frank Pabodie 751 pb temp -18.50
9 lake Anderson Lake 734 lake sal 0.05
10 lake Anderson Lake 751 lake sal 0.10
11 lake Anderson Lake 752 lake rad 2.19
12 lake Anderson Lake 752 lake sal 0.09
13 lake Anderson Lake 752 lake temp -16.00
14 lake Anderson Lake 837 lake rad 1.46
15 lake Anderson Lake 837 lake sal 0.21
16 roe Valentina Roerich 752 roe sal 41.60
17 roe Valentina Roerich 837 roe sal 22.50
18 roe Valentina Roerich 844 roe rad 11.25

3.删除drop

import pandas as pd
df = pd.read_csv('./data/survey_visited.csv')
print(df)

   ident   site       dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

3.1删除行

删除单行

df2=df.drop(labels=0)   # axis默认等于0,即按行删除,这里表示按行删除第0行
print(df2)

   ident   site       dated
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

删除多行

df3=df.drop(labels=[1,3],axis=0)   # axis=0 表示按行删除,删除第1行和第3行
print(df)
print(df3)

   ident   site       dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
ident site dated
0 619 DR-1 1927-02-08
2 734 DR-3 1939-01-07
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

要删除连续的多行可以用range(),删除连续的多列不能用此方法

df4=df.drop(labels=range(1,3),axis=0)   # axis=0 表示按行删除,删除索引值为1和2的整行数据(range(1,3)不包含3)
print(df4)

   ident   site       dated
0 619 DR-1 1927-02-08
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

3.2删除列

删除单列

df5=df.drop(labels='site',axis=1)  # axis=1 表示按列删除,删除site列
print(df)
print(df5)

   ident   site       dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
ident dated
0 619 1927-02-08
1 622 1927-02-10
2 734 1939-01-07
3 735 1930-01-12
4 751 1930-02-26
5 752 NaN
6 837 1932-01-14
7 844 1932-03-22

删除指定的某几列

df6=df.drop(labels=['ident',"site"],axis=1)  # axis=1 表示按列删除,删除ident、site列
print(df)
print(df6)

   ident   site       dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
dated
0 1927-02-08
1 1927-02-10
2 1939-01-07
3 1930-01-12
4 1930-02-26
5 NaN
6 1932-01-14
7 1932-03-22


举报

相关推荐

0 条评论