# 练习题2 (Exercise 2)
# Exercise 2: build a Series of superhero strength levels and inspect it.
# pandas is imported here because this section uses `pd` before the file's
# later `import pandas as pd` statement is reached.
import pandas as pd

# Given data: hero names and, in the same order, their strength levels.
superheros = [
    'Batman',
    'Superman',
    'Spider-man',
    'Iron man',
    'Captain America',
    'Wonder Woman',
]
strength_levels = (100, 120, 90, 95, 110, 120)

# 1. Convert the list of names to a Series.
pd.Series(superheros)

# 2. Convert the tuple of strength levels to a Series.
pd.Series(strength_levels)

# 3. Create a Series with the names as index labels and the strength
#    levels as values.
heros = pd.Series(
    data=strength_levels,
    index=superheros,
)
heros

# 4. First two rows.
heros.head(2)

# 5. Last four rows.
heros.tail(4)

# 6. Number of distinct strength values.
heros.nunique()

# 7. Average strength.
heros.mean()

# 8. Highest and lowest strength.
heros.max()
heros.min()

# 9. Every strength value doubled (returns a new Series; heros is unchanged).
heros * 2

# 10. Convert the Series to a plain dict mapping hero name to strength.
dict(heros)
# 练习题3 (Exercise 3)
import pandas as pd
import datetime as dt
# Convert a date into the name of its day of the week.
def day_of_week(day):
    """Return the full weekday name (e.g. 'Monday') for *day*.

    *day* may be any object that provides ``strftime``, such as a
    ``datetime.date`` or ``datetime.datetime``. The name is produced by the
    ``%A`` format code, so it follows the active locale.
    """
    return day.strftime('%A')
# Find the weekday on which the most battles started, using only the
# 'Start Date' column of the file.
days_of_war = pd.read_csv(
    'revolutionary_war.csv',
    parse_dates=['Start Date'],
    usecols=['Start Date'],
).squeeze('columns')  # collapse the single-column DataFrame into a Series
print(type(days_of_war))
print()
days_of_war

# dropna() discards the missing dates; apply() then calls day_of_week on each
# remaining value, and the resulting weekday names are stored in `days`.
days = days_of_war.dropna().apply(day_of_week)
days

# Count how many times each weekday occurs.
days.value_counts()
# 练习题4 (Exercise 4)
# 1. Load nfl.csv, parsing the Birthday column as datetimes.
nfl = pd.read_csv('nfl.csv', parse_dates=['Birthday'])
nfl

# 2. Two ways to make Name the index of the DataFrame.
# First way: set the index on the DataFrame that is already loaded.
nfl = nfl.set_index('Name')
nfl

# Second way: choose the index column while reading the file.
nfl_2 = pd.read_csv('nfl.csv', parse_dates=['Birthday'], index_col='Name')
# Display nfl_2 (the frame just built) rather than nfl, so the result of the
# second approach is the one being inspected.
nfl_2

# 3. Count the players on each team.
nfl['Team'].value_counts()

# 4. The five highest-paid players.
nfl.sort_values(by=['Salary'], ascending=False).head(5)

# 5. Sort by Team alphabetically, then by Salary from highest to lowest
#    within each team.
nfl.sort_values(by=['Team', 'Salary'], ascending=[True, False])

# 6. Who is the oldest player on the New York Jets roster, and what is his
#    birthday? Re-index by Team so one team's rows can be selected by label.
nfl = nfl.reset_index().set_index('Team')
nfl
nfl.loc['New York Jets']
# The earliest Birthday belongs to the oldest player.
nfl.loc['New York Jets'].sort_values('Birthday').head(1)
# 练习题5 (Exercise 5)
# Optimize the dataset to limit memory use and maximize its utility.
netflix = pd.read_csv('netflix.csv', parse_dates=['date_added'])
netflix.info()
netflix.nunique()
# Store `type` as a categorical column, then print the summary again.
netflix['type'] = netflix['type'].astype('category')
netflix.info()

# Rows whose title is "Limitless".
title = (netflix['title'] == 'Limitless')
netflix.loc[title]

# Rows directed by "Robert Rodriguez" whose type is "Movie".
director = netflix['director'] == 'Robert Rodriguez'
typeMovie = (netflix['type'] == 'Movie')
netflix.loc[director & typeMovie]

# Rows added on "2019-07-31" or directed by "Robert Altman".
date = (netflix['date_added'] == '2019-07-31')
director = (netflix['director'] == 'Robert Altman')
netflix.loc[date | director]

# Rows directed by "Orson Welles", "Aditya Kripalani" or "Sam Raimi".
directors = [
    'Orson Welles',
    'Aditya Kripalani',
    'Sam Raimi',
]
target = netflix['director'].isin(directors)
netflix.loc[target]

# Rows added between 1 May 2019 and 1 June 2019.
addMovie = netflix['date_added'].between(left='2019-5-1', right='2019-6-1')
netflix.loc[addMovie]

# Drop every row whose director is NaN (returns a new DataFrame).
netflix.dropna(subset=['director'])

# Identify the dates on which Netflix added only one title to its catalog:
# keep=False removes every row whose date_added value occurs more than once.
netflix.drop_duplicates(keep=False, subset=['date_added'])
# 练习题6 (Exercise 6)
# Load the customers data and look at the raw Address strings.
customers = pd.read_csv('customers.csv')
customers
customers['Address'].values

# Split each address into one column per comma-separated piece. The pattern
# also consumes any whitespace around the comma; splitting on a bare ','
# would leave the space that follows a comma at the start of each piece.
# NOTE(review): assumes the file separates pieces with ", " -- the pattern is
# harmless if there is no such whitespace. Because the pattern is longer than
# one character, pandas interprets it as a regular expression.
split = customers['Address'].str.split(r'\s*,\s*', expand=True)
split

# Attach the pieces under descriptive names and drop the original column.
customers[['Street', 'City', 'State', 'Zip']] = split
customers = customers.drop(labels='Address', axis='columns')
customers
# 练习题8 (Exercise 8)
# Load the used-car listings and the minimum-wage table.
car = pd.read_csv('used_cars.csv')
car
min_wage = pd.read_csv('minimum_wage.csv')
min_wage.head()

# 1. Sum of Price for each Fuel value.
car.pivot_table(index='Fuel', values='Price', aggfunc='sum')

# 2. Count of Price entries per Manufacturer (rows) and Transmission
#    (columns), with margins labelled 'Total'.
car.pivot_table(
    index='Manufacturer',
    columns='Transmission',
    values='Price',
    aggfunc='count',
    margins=True,
    margins_name='Total',
)

# 3. Mean Price per (Year, Fuel) pair, one column per Transmission.
car.pivot_table(
    index=['Year', 'Fuel'],
    columns='Transmission',
    values='Price',
    aggfunc='mean',
)

# 4. The same pivot table kept in c1, then stacked so the Transmission
#    column labels move into the row index.
c1 = car.pivot_table(
    index=['Year', 'Fuel'],
    columns='Transmission',
    values='Price',
    aggfunc='mean',
)
c1.stack()

# 5. Reshape min_wage from wide to long: State stays as the identifier and
#    every other column becomes a (Year, wage) pair.
# `year` is not passed to melt; without value_vars, melt unpivots all
# columns that are not listed in id_vars.
year = [
    '2010', '2011', '2012', '2013',
    '2014', '2015', '2016', '2017',
]
min_wage.melt(id_vars='State', value_name='wage', var_name='Year')