# 导入库
# 将 numpy 和 pandas 导入并命名为np、pd
import numpy as np
import pandas as pd
# Load the training data from a CSV path relative to the working directory.
df = pd.read_csv('train.csv')
# Print the first three rows for a quick look at the data.
print(df.head(3))
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
# Re-read the file with Chinese column labels: header=0 makes pandas discard
# the original header row, names= supplies the replacement labels, and
# index_col promotes the passenger-ID column to the row index.
column_names = [
    '乘客ID', '是否幸存', '仓位等级', '姓名', '性别', '年龄',
    '兄弟姐妹个数', '父母子女个数', '船票信息', '票价', '客舱', '登船港口',
]
df = pd.read_csv(
    'train.csv',
    header=0,
    names=column_names,
    index_col='乘客ID',
)
# Print the first three rows to confirm the relabelling.
print(df.head(3))
是否幸存 仓位等级 姓名 性别 \
乘客ID
1 0 3 Braund, Mr. Owen Harris male
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female
3 1 3 Heikkinen, Miss. Laina female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
1 22.0 1 0 A/5 21171 7.2500 NaN S
2 38.0 1 0 PC 17599 71.2833 C85 C
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# Summarise the frame: per-column non-null counts, dtypes and memory usage.
# DataFrame.info() writes the report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 891 non-null int64
1 仓位等级 891 non-null int64
2 姓名 891 non-null object
3 性别 891 non-null object
4 年龄 714 non-null float64
5 兄弟姐妹个数 891 non-null int64
6 父母子女个数 891 non-null int64
7 船票信息 891 non-null object
8 票价 891 non-null float64
9 客舱 204 non-null object
10 登船港口 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
# Inspect both ends of the table: the first 10 rows (head) and the last 10
# rows (tail).
print(df.head(10))
print(df.tail(10))
是否幸存 仓位等级 姓名 性别 \
乘客ID
1 0 3 Braund, Mr. Owen Harris male
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female
3 1 3 Heikkinen, Miss. Laina female
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
5 0 3 Allen, Mr. William Henry male
6 0 3 Moran, Mr. James male
7 0 1 McCarthy, Mr. Timothy J male
8 0 3 Palsson, Master. Gosta Leonard male
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
1 22.0 1 0 A/5 21171 7.2500 NaN S
2 38.0 1 0 PC 17599 71.2833 C85 C
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 35.0 1 0 113803 53.1000 C123 S
5 35.0 0 0 373450 8.0500 NaN S
6 NaN 0 0 330877 8.4583 NaN Q
7 54.0 0 0 17463 51.8625 E46 S
8 2.0 3 1 349909 21.0750 NaN S
9 27.0 0 2 347742 11.1333 NaN S
10 14.0 1 0 237736 30.0708 NaN C
是否幸存 仓位等级 姓名 性别 年龄 \
乘客ID
882 0 3 Markun, Mr. Johann male 33.0
883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0
884 0 2 Banfield, Mr. Frederick James male 28.0
885 0 3 Sutehall, Mr. Henry Jr male 25.0
886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0
887 0 2 Montvila, Rev. Juozas male 27.0
888 1 1 Graham, Miss. Margaret Edith female 19.0
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN
890 1 1 Behr, Mr. Karl Howell male 26.0
891 0 3 Dooley, Mr. Patrick male 32.0
兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
882 0 0 349257 7.8958 NaN S
883 0 0 7552 10.5167 NaN S
884 0 0 C.A./SOTON 34068 10.5000 NaN S
885 0 0 SOTON/OQ 392076 7.0500 NaN S
886 0 5 382652 29.1250 NaN Q
887 0 0 211536 13.0000 NaN S
888 0 0 112053 30.0000 B42 S
889 1 2 W./C. 6607 23.4500 NaN S
890 0 0 111369 30.0000 C148 C
891 0 0 370376 7.7500 NaN Q
# Flag missing values: the result has the same shape as df, holding True where
# a cell is null and False otherwise.
df.isnull()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
1 | False | False | False | False | False | False | False | False | False | True | False |
2 | False | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | True | False |
4 | False | False | False | False | False | False | False | False | False | False | False |
5 | False | False | False | False | False | False | False | False | False | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
887 | False | False | False | False | False | False | False | False | False | True | False |
888 | False | False | False | False | False | False | False | False | False | False | False |
889 | False | False | False | False | True | False | False | False | False | True | False |
890 | False | False | False | False | False | False | False | False | False | False | False |
891 | False | False | False | False | False | False | False | False | False | True | False |
891 rows × 11 columns
# Save the relabelled table to a new CSV file.
# NOTE(review): 'tain_cn.csv' looks like a typo for 'train_cn.csv' — confirm
# that nothing else reads the current name before renaming it.
df.to_csv('tain_cn.csv')
# Top 10 rows after sorting by fare, then age, both in descending order.
print(df.sort_values(by=['票价','年龄'],ascending=False).head(10))
# Observation (written as comments rather than a bare string literal, which is
# an expression statement and gets echoed as cell output):
# a higher fare should correspond to a better cabin, and 8 of the 10
# highest-fare passengers survived, which is a high proportion. Worth
# exploring later: fare vs. age, and fare vs. survival rate.
是否幸存 仓位等级 姓名 性别 \
乘客ID
680 1 1 Cardeza, Mr. Thomas Drake Martinez male
259 1 1 Ward, Miss. Anna female
738 1 1 Lesurer, Mr. Gustave J male
439 0 1 Fortune, Mr. Mark male
342 1 1 Fortune, Miss. Alice Elizabeth female
89 1 1 Fortune, Miss. Mabel Helen female
28 0 1 Fortune, Mr. Charles Alexander male
743 1 1 Ryerson, Miss. Susan Parker "Suzette" female
312 1 1 Ryerson, Miss. Emily Borie female
300 1 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
680 36.0 0 1 PC 17755 512.3292 B51 B53 B55 C
259 35.0 0 0 PC 17755 512.3292 NaN C
738 35.0 0 0 PC 17755 512.3292 B101 C
439 64.0 1 4 19950 263.0000 C23 C25 C27 S
342 24.0 3 2 19950 263.0000 C23 C25 C27 S
89 23.0 3 2 19950 263.0000 C23 C25 C27 S
28 19.0 3 2 19950 263.0000 C23 C25 C27 S
743 21.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
312 18.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
300 50.0 0 1 PC 17558 247.5208 B58 B60 C
'\n根据常识我知道发现票价越高的应该客舱越好,\n所以我们会明显看出,票价前10的乘客中存活的有8人,\n这是相当高的一个比例,后期可以尝试分析票价和年龄的关系,票价和存活率的关系\n'
# Descriptive statistics of the fare column.
print(df['票价'].describe())
# Reading of the result (kept as comments so no string literal is echoed as
# cell output):
# - 891 fare values in total;
# - mean is about 32.20;
# - standard deviation is about 49.69, i.e. fares vary widely;
# - 25% of fares are below 7.91, 50% below 14.45 and 75% below 31.00;
# - the maximum is about 512.33 and the minimum is 0.
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: 票价, dtype: float64
'\n一共有891个票价数据,\n平均值约为:32.20,\n标准差约为49.69,说明票价波动特别大,\n25%的人的票价是低于7.91的,50%的人的票价低于14.45,75%的人的票价低于31.00,\n票价最大值约为512.33,最小值为0。\n'
# Sort by passenger class, then survival flag, both descending, and look at
# the first and last 10 rows. The frame is sorted once and reused.
by_class = df.sort_values(by=['仓位等级','是否幸存'],ascending=False)
print(by_class.head(10))
print(by_class.tail(10))
# Caution when reading this output: survival is the secondary descending sort
# key, so the head is by construction "class 3, survived" rows and the tail
# "class 1, did not survive" rows. "Everyone in the first 10 survived / nobody
# in the last 10 did" is therefore produced by the ordering, not a finding.
# Also note that 3 is merely the numerically largest class value, not the best
# cabin class: the highest-fare passengers are all class 1.
# To actually relate class to survival, compare per-class survival rates,
# e.g. df.groupby('仓位等级')['是否幸存'].mean().
# The same kind of sorting plus descriptive statistics can be repeated for
# other columns such as age or sex.
是否幸存 仓位等级 姓名 性别 \
乘客ID
3 1 3 Heikkinen, Miss. Laina female
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
11 1 3 Sandstrom, Miss. Marguerite Rut female
20 1 3 Masselmani, Mrs. Fatima female
23 1 3 McGowan, Miss. Anna "Annie" female
26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female
29 1 3 O'Dwyer, Miss. Ellen "Nellie" female
33 1 3 Glynn, Miss. Mary Agatha female
37 1 3 Mamee, Mr. Hanna male
40 1 3 Nicola-Yarred, Miss. Jamila female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
9 27.0 0 2 347742 11.1333 NaN S
11 4.0 1 1 PP 9549 16.7000 G6 S
20 NaN 0 0 2649 7.2250 NaN C
23 15.0 0 0 330923 8.0292 NaN Q
26 38.0 1 5 347077 31.3875 NaN S
29 NaN 0 0 330959 7.8792 NaN Q
33 NaN 0 0 335677 7.7500 NaN Q
37 NaN 0 0 2677 7.2292 NaN C
40 14.0 1 0 2651 11.2417 NaN C
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 \
乘客ID
749 0 1 Marvin, Mr. Daniel Warner male 19.0 1
767 0 1 Brewe, Dr. Arthur Jackson male NaN 0
783 0 1 Long, Mr. Milton Clyde male 29.0 0
790 0 1 Guggenheim, Mr. Benjamin male 46.0 0
794 0 1 Hoyt, Mr. William Fisher male NaN 0
807 0 1 Andrews, Mr. Thomas Jr male 39.0 0
816 0 1 Fry, Mr. Richard male NaN 0
823 0 1 Reuchlin, Jonkheer. John George male 38.0 0
868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0
873 0 1 Carlsson, Mr. Frans Olof male 33.0 0
父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
749 0 113773 53.1000 D30 S
767 0 112379 39.6000 NaN C
783 0 113501 30.0000 D6 S
790 0 PC 17593 79.2000 B82 B84 C
794 0 PC 17600 30.6958 NaN C
807 0 112050 0.0000 A36 S
816 0 112058 0.0000 B102 S
823 0 19972 0.0000 NaN S
868 0 PC 17590 50.4958 A24 S
873 0 695 5.0000 B51 B53 B55 S
'\n通过观察可以发现,前10仓位等级最高者(3级),全部幸存,\n后10仓位等级最低者(1级),全部未能幸免\n后期可以尝试探索仓位等级与存活的关系\n'
# Inspecting missing values.
# Method 1: info() reports the non-null count of every column. It prints the
# report itself and returns None, so it is called without print() to avoid a
# stray "None" line.
df.info()
# Method 2: count the null cells in each column.
print(df.isnull().sum())
# Observation: the age, cabin and embarkation-port columns contain missing
# values.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 891 non-null int64
1 仓位等级 891 non-null int64
2 姓名 891 non-null object
3 性别 891 non-null object
4 年龄 714 non-null float64
5 兄弟姐妹个数 891 non-null int64
6 父母子女个数 891 non-null int64
7 船票信息 891 non-null object
8 票价 891 non-null float64
9 客舱 204 non-null object
10 登船港口 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
是否幸存 0
仓位等级 0
姓名 0
性别 0
年龄 177
兄弟姐妹个数 0
父母子女个数 0
船票信息 0
票价 0
客舱 687
登船港口 2
dtype: int64
'\n通过观察可以发现,年龄,客舱,登船港口有缺失值\n'
# Handling missing values (the statements are kept commented out for reference).
# dropna() with its default arguments drops every ROW that contains a missing
# value; pass axis=1 to drop columns instead.
# df.dropna()
# WARNING: the boolean-mask assignment below sets EVERY column of the rows
# whose age is missing to 0, not just the age cell. To zero only the age use
# df.loc[df['年龄'].isna(), '年龄'] = 0 instead.
# df[df['年龄'].isna()] = 0
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | 年龄类别 | 性别类别 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 | 1 |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 | 2 |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 | 2 |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 | 2 |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 | 1 |
# Preview the table with every missing value replaced by 0. fillna(0) fills
# all columns (text columns included), not only the numeric ones, and returns
# a new DataFrame; the result is not assigned here, so df itself is unchanged.
df.fillna(0)
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 0 | S |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 0 | S |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 0 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 0 | S |
888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
889 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0000 | 0 | 0 |
890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 0 | Q |
891 rows × 11 columns
# Show the rows that are exact duplicates of an earlier row.
df[df.duplicated()]
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
18 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
20 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
27 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
29 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
30 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
860 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
864 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
869 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
879 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
889 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
176 rows × 11 columns
# Remove duplicate rows (the first occurrence of each is kept), then re-check
# the row count and non-null counts.
df = df.drop_duplicates()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 715 non-null int64
1 仓位等级 715 non-null int64
2 姓名 715 non-null object
3 性别 715 non-null object
4 年龄 715 non-null float64
5 兄弟姐妹个数 715 non-null int64
6 父母子女个数 715 non-null int64
7 船票信息 715 non-null object
8 票价 715 non-null float64
9 客舱 186 non-null object
10 登船港口 713 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 67.0+ KB
# Feature overview and processing.
# (Kept as comments: a bare triple-quoted string is an expression statement
# and is echoed verbatim as cell output.)
#
# The features fall into two broad groups.
# Numeric features: Survived, Pclass, Age, SibSp, Parch, Fare.
#   - Survived and Pclass are discrete.
#   - Age and Fare are continuous; SibSp and Parch are integer counts.
# Text features: Name, Sex, Cabin, Embarked, Ticket.
#   - Sex, Cabin, Embarked and Ticket are categorical.
#
# Numeric features can usually be fed to a model directly, although
# continuous variables are sometimes discretised to make the model more
# stable and robust. Text features normally have to be converted to numeric
# features before they can be used for modelling.
'\n我们对特征进行一下观察,可以把特征大概分为两大类: \n数值型特征:Survived ,Pclass, Age ,SibSp, Parch, Fare,\n其中Survived, Pclass为离散型数值特征,\nAge,SibSp, Parch, Fare为连续型数值特征 \n\n文本型特征:Name, Sex, Cabin,Embarked, Ticket,\n其中Sex, Cabin, Embarked, Ticket为类别型文本特征。\n\n数值型特征一般可以直接用于模型的训练,\n但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。\n文本型特征往往需要转换成数值型特征才能用于建模分析。\n'
# Discretise the continuous age column into 5 equal-width intervals and label
# them with the category codes 1-5.
age_labels = [1, 2, 3, 4, 5]
df['年龄类别'] = pd.cut(df['年龄'], bins=5, labels=age_labels)
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | 年龄类别 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | ||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 3 |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 2 |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 3 |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 3 |
# Discretise age with explicit edges into (0,5], (5,15], (15,30], (30,50] and
# (50,80], labelled with the category codes 1-5.
age_edges = [0, 5, 15, 30, 50, 80]
df['年龄类别'] = pd.cut(df['年龄'], bins=age_edges, labels=[1, 2, 3, 4, 5])
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | 年龄类别 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | ||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 3 |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
# Discretise age by quantile: the cut points are the 0%, 10%, 30%, 50%, 70%
# and 90% quantiles, giving five groups labelled 1-5.
# NOTE(review): the last edge is the 0.9 quantile, so ages above it fall
# outside every bin and get no label (NaN). If the top 10% should belong to
# group 5, the last edge presumably needs to be 1 — confirm the intent.
df['年龄类别'] = pd.qcut(df['年龄'],[0,0.1,0.3,0.5,0.7,0.9],labels = [1,2,3,4,5])
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | 年龄类别 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | ||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
# Count how often each distinct value occurs in the categorical text columns.
print(df['性别'].value_counts()) # values and counts of the sex column (Sex)
print(df['登船港口'].value_counts()) # values and counts of the embarkation-port column (Embarked)
print(df['客舱'].value_counts()) # values and counts of the cabin column (Cabin)
male 453
female 261
0 1
Name: 性别, dtype: int64
S 554
C 130
Q 28
0 1
Name: 登船港口, dtype: int64
G6 4
C23 C25 C27 4
B96 B98 4
F2 3
F33 3
..
A6 1
C104 1
B39 1
B69 1
0 1
Name: 客舱, Length: 135, dtype: int64
# Convert categorical text to numeric codes.
# Method 1: Series.replace — every cell equal to a key of the mapping is
# swapped for the corresponding value; other cells are left as they are.
# Encoding used here: male -> 1, female -> 2.
sex_codes = {'male': 1, 'female': 2}
df['性别类别'] = df['性别'].replace(sex_codes)
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | 年龄类别 | 性别类别 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 | 1 |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 | 2 |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 | 2 |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 | 2 |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 | 1 |
# Method 2: LabelEncoder — assign an integer code to each distinct value.
from sklearn.preprocessing import LabelEncoder
# The cabin column mixes strings with missing values (float NaN); the encoder
# sorts the values to build its classes, and comparing str with float raises
# TypeError. Casting to str first gives every cell a comparable type (missing
# values become the category 'nan').
# The codes go into a new column so the original cabin text stays available
# to the later one-hot encoding step.
df['客舱编码'] = LabelEncoder().fit_transform(df['客舱'].astype(str))
df.head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-28-a090f7066f88> in <module>
1 from sklearn.preprocessing import LabelEncoder
----> 2 df['客舱'] = LabelEncoder().fit_transform(df['客舱'])
3 df.head()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
110 """
111 y = column_or_1d(y, warn=True)
--> 112 self.classes_, y = np.unique(y, return_inverse=True)
113 return y
114
<__array_function__ internals> in unique(*args, **kwargs)
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
317
318 if optional_indices:
--> 319 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
320 aux = ar[perm]
321 else:
TypeError: '<' not supported between instances of 'str' and 'float'
# Label-encode the embarkation port (LabelEncoder is imported in the cell
# above). The column can hold non-string cells (missing values, or a numeric
# 0 placeholder) next to the port letters, and the encoder's sort then fails
# with TypeError; casting to str first makes every cell comparable.
# The codes go into a new column so the original port letters stay available
# to the later one-hot encoding step.
df['登船港口编码'] = LabelEncoder().fit_transform(df['登船港口'].astype(str))
df.head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-f52d686f7adb> in <module>
----> 1 df['登船港口'] = LabelEncoder().fit_transform(df['登船港口'])
2 df.head()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
110 """
111 y = column_or_1d(y, warn=True)
--> 112 self.classes_, y = np.unique(y, return_inverse=True)
113 return y
114
<__array_function__ internals> in unique(*args, **kwargs)
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
317
318 if optional_indices:
--> 319 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
320 aux = ar[perm]
321 else:
TypeError: '<' not supported between instances of 'int' and 'str'
# One-hot encoding: expand a column into one 0/1 indicator column per
# distinct value.
# NOTE(review): this encodes the raw age column, which yields one indicator
# column per distinct age; the binned '年龄类别' column may have been the
# intended input — confirm.
age_dummies = pd.get_dummies(df['年龄'],prefix='年龄')
# Append the indicator columns to the original table, side by side (axis=1).
df = pd.concat([df,age_dummies],axis=1)
# (A bare `df` expression used to sit here; in the middle of a cell it has no
# effect, so it was removed.)
df.to_csv('train_onehot.csv')
# One-hot encode the cabin and embarkation-port columns: build one indicator
# frame per column (prefixed with the column name), then append them all to
# the table in a single column-wise concat, cabin indicators first.
indicator_frames = [
    pd.get_dummies(df[name], prefix=name)
    for name in ('客舱', '登船港口')
]
df = pd.concat([df, *indicator_frames], axis=1)
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | ... | 客舱_F G73 | 客舱_F2 | 客舱_F33 | 客舱_F4 | 客舱_G6 | 客舱_T | 登船港口_0 | 登船港口_C | 登船港口_Q | 登船港口_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 643 columns
# Extract the honorific (Mr, Mrs, Miss, ...) from the name column with
# Series.str.extract: the pattern captures the run of letters that directly
# precedes a literal dot.
# The pattern is a raw string because '\.' in a normal string literal is an
# invalid escape sequence. expand=False returns the single capture group as a
# Series, which is the natural shape for a new column.
df['Title'] = df['姓名'].str.extract(r'([A-Za-z]+)\.', expand=False)
df
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | ... | 客舱_F2 | 客舱_F33 | 客舱_F4 | 客舱_G6 | 客舱_T | 登船港口_0 | 登船港口_C | 登船港口_Q | 登船港口_S | Title | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Mr |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Mrs |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Miss |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Mrs |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Mr |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Mrs |
887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Rev |
888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Miss |
890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Mr |
891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Mr |
715 rows × 644 columns
# Save the fully processed table to a CSV file.
df.to_csv('test_fin.csv')