import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw job postings, keep the analysis columns, and narrow the data
# down to data-analysis positions. Bare expressions such as `data.head()` only
# display something in an interactive session; as a script their result is
# discarded.
np.array((pd, np, plt))
data = pd.read_csv(r'I:\AIoT智能物联网工程师\AIoT智能物联网\Python数据分析\综合项目实战\课程资料\数据分析综合项目实战\job.csv')
print(data.shape)

# Column names and distinct cities, each laid out as a single-row array.
np.array(data.columns).reshape((-1, data.shape[1]))
city_items = data.city.unique()
city_items.reshape((-1, city_items.shape[0]))

# Keep only the columns used below and drop fully duplicated rows.
column_items = ["positionName", "companyShortName", "city", "companySize", "education", "financeStage", "industryField", "salary", "workYear","companyLabelList", "job_detail"]
data = data[column_items].drop_duplicates()
print(data.shape)
data.head()

# Keep postings whose title contains '数据分析'. na=False makes a missing title
# count as "no match"; without it the mask would contain NaN and could not be
# used for boolean indexing.
cond = data.positionName.str.contains('数据分析', na=False)
data = data[cond]
print(data.shape)
data.tail()

# Renumber the rows 0..n-1 after filtering.
data = data.reset_index(drop=True)
data.tail()
# Replace the textual salary range (e.g. '10k-20k') with the midpoint of its
# two bounds.
salary = data.salary.str.lower()
salary.head()

# Two capture groups -> a two-column frame holding the lower and upper bound.
salary = salary.str.extract(r'(\d+)k-(\d+)k')

# Convert to float rather than calling int() on every cell: rows that do not
# match the pattern come back as NaN, which int() cannot convert. This also
# avoids the deprecated DataFrame.applymap.
salary = salary.astype(float)
salary.head()

# Row-wise mean of the two bounds; rows without a parsable range stay NaN.
data['salary'] = salary.mean(axis=1)
data.head()
# Lower-cased job descriptions with missing entries replaced by empty strings,
# so substring checks can run on every row.
job_detail = data.job_detail.str.lower().fillna('')
# Extract the skill requirements from job_detail, grouped into these categories:
#   - Python
#   - SQL
#   - Tableau
#   - Excel
#   - SPSS/SAS
def _mentions_any(*keywords):
    """Build a function that returns 1 if a text contains any keyword, else 0."""
    def flag(text):
        return 1 if any(word in text for word in keywords) else 0
    return flag


# (column name, flag function) pairs; each function does plain substring
# checks against the lower-cased job description.
job_detail_items = [
    ('Python', _mentions_any('python')),
    ('SQL', _mentions_any('sql', 'hive')),
    ('Tableau', _mentions_any('tableau')),
    ('Excel', _mentions_any('excel')),
    ('SPSS/SAS', _mentions_any('spss', 'sas')),
]

# Add one 0/1 column per skill to the main frame.
for skill_name, has_skill in job_detail_items:
    data[skill_name] = job_detail.map(has_skill)
data.head()

industryField = data.industryField
def clean_industry(_):
    """Reduce a comma-separated industry string to a single label.

    The first label is returned, unless it is '移动互联网' and a second label
    exists, in which case the second label is returned.

    Values that are not strings (NaN or None from missing data) are returned
    unchanged instead of raising AttributeError on ``.split``.
    """
    if not isinstance(_, str):
        return _
    labels = _.split(',')
    if labels[0] == '移动互联网' and len(labels) > 1:
        return labels[1]
    return labels[0]
# Collapse each industry string to a single label.
data['industryField'] = industryField.map(clean_industry)
data.head()

# Number of postings per city, shown as a one-row table.
city_items_count = data.city.value_counts()
pd.DataFrame({'count': city_items_count}).T

# List the font names matplotlib knows about, then select SimHei for both the
# family and the sans-serif setting, and turn off the Unicode minus sign.
from matplotlib.font_manager import FontManager
fm = FontManager()
np.array([font.name for font in fm.ttflist])
plt.rcParams['font.family'] = 'SimHei'
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

# City names and their counts, each as a single-row array.
city_names = city_items_count.index
city_totals = city_items_count.values
np.array(city_names).reshape(-1, city_names.shape[0])
np.array(city_totals).reshape(-1, city_totals.shape[0])
# Horizontal bar chart of the number of postings per city.
plt.figure(figsize=(12, 9))

# Both sequences are reversed together, so the first entry of
# city_items_count becomes the last bar passed to barh.
cities_reversed = city_items_count.index[::-1]
counts_reversed = city_items_count.values[::-1]
plt.barh(y=cities_reversed, width=counts_reversed, color='#3c7f99')

plt.box(False)
city_title_style = dict(
    fontsize=32, weight='bold', color='white',
    backgroundcolor='#c5b783', pad=30,
)
plt.title(label= '各城市对数据分析岗位的需求量', **city_title_style)
plt.tick_params(labelsize=16)
plt.grid(axis='x', linewidth=0.5, color='#3c7f99')
# The ten most frequent industry labels and how often each occurs.
industryField_items_count = data.industryField.value_counts().head(10)
industryField_items_count

# Horizontal bar chart; both sequences are reversed together before plotting.
plt.figure(figsize=(12, 9))
industries_reversed = industryField_items_count.index[::-1]
industry_counts_reversed = industryField_items_count.values[::-1]
plt.barh(y=industries_reversed, width=industry_counts_reversed, color='#3c7f99')

plt.box(False)
industry_title_style = dict(
    fontsize=32, weight='bold', color='white',
    backgroundcolor='#c5b783', pad=30,
)
plt.title(label= '不同领域对数据分析岗的需求量(前10)', **industry_title_style)
plt.tick_params(labelsize=16)
plt.grid(axis='x', linewidth=0.5, color='#3c7f99', linestyle='--')
# Mean salary per city, sorted in ascending order.
city_items_salary = data.groupby('city').salary.mean().sort_values()
city_items_salary

plt.figure(figsize=(12, 9))
plt.bar(
    x=city_items_salary.index,
    height=city_items_salary.values,
    # One colour per bar, sampled evenly across the RdBu_r colormap.
    color=plt.cm.RdBu_r(np.linspace(0, 1, len(city_items_salary)))
)
plt.title(
    label='各城市薪资状况',
    fontsize=32, weight='bold', color='white', backgroundcolor='#3c7f99'
)
plt.tick_params(labelsize=16)
plt.grid(axis='y', lw=0.5, color='black')

# Build the tick labels from the tick values so every label uses the same
# 'K' suffix (the hand-written list mixed '15k' with '5K'/'10K'/'20K'). The
# zero tick stays unlabeled, as before.
salary_ticks = np.arange(0, 25, step=5)
salary_tick_labels = ['' if tick == 0 else str(tick) + 'K' for tick in salary_ticks]
plt.yticks(ticks=salary_ticks, labels=salary_tick_labels)
plt.box(False)
# Heat map of salary by city (rows) and required experience (columns).
# pivot_table is called without aggfunc, so it uses its default aggregation.
work_salary = data.pivot_table(
    index='city',
    columns='workYear',
    values='salary'
)
# Display-only: the filled copy is not assigned back.
work_salary.fillna('')

# Select and order the experience columns, then sort cities by the '5-10年'
# column, highest first.
work_salary_coumns = ['应届毕业生', '1-3年', '3-5年', '5-10年']
work_salary[work_salary_coumns]
work_salary = work_salary[work_salary_coumns].sort_values(by='5-10年', ascending=False)
work_salary

# Repeat every column 4 times so each experience level spans 4 image columns.
work_salary_values = work_salary.values
work_salary_values = np.repeat(work_salary_values, 4, axis=1)

plt.figure(figsize=(12, 9))
plt.imshow(work_salary_values, cmap='RdBu_r')

# Centre each x tick on its group of 4 image columns (1.5, 5.5, 9.5, ...).
_ = plt.xticks(np.arange(len(work_salary.columns)) * 4 + 1.5, work_salary.columns)
# One y tick per city row; derived from the data instead of a fixed 13.
_ = plt.yticks(np.arange(len(work_salary.index)), work_salary.index)

# Write each non-missing value once per group, centred on the group.
h, w = work_salary_values.shape
for x in range(0, w, 4):
    for y in range(h):
        value = work_salary_values[y, x]
        if not np.isnan(value):
            plt.text(x+1.5, y, round(value, 1),
                     ha='center', va='center',color='r', fontsize=16
                     )
plt.colorbar(shrink=0.85)
plt.tick_params(labelsize=16)
# Share of postings per education requirement (fractions, not counts).
education = data.education.value_counts(normalize=True)
education

# Ring-shaped pie chart with the percentage printed inside each wedge.
plt.figure(figsize=(9, 9))
ring_style = {'lw': 3, 'width': 0.5}
label_style = {'fontsize': 20}
_ = plt.pie(
    education,
    labels=education.index,
    autopct='%0.2f%%',
    wedgeprops=ring_style,
    pctdistance=0.8,
    textprops=label_style,
)
_ = plt.title(
    label='学历要求',
    fontsize=32,
    weight='bold',
    color='white',
    backgroundcolor='#c5b893',
)

# Skill column names, in the order they are listed in job_detail_items.
job_detail_items_key = [pair[0] for pair in job_detail_items]
def get_level(_, skill_keys=None):
    """Return a copy of a row with a 'skill' entry added.

    'skill' is the first key in ``skill_keys`` whose value in the row equals 1,
    or '其他' when none does.

    Parameters:
        _: a row (e.g. a pandas Series) indexable by every skill key.
        skill_keys: skill column names in priority order; defaults to the
            module-level ``job_detail_items_key``.

    The row is copied before 'skill' is set, because pandas does not support
    functions that mutate the object passed in by ``DataFrame.apply``.
    """
    if skill_keys is None:
        skill_keys = job_detail_items_key
    row = _.copy()
    row['skill'] = next((key for key in skill_keys if row[key] == 1), '其他')
    return row
# Label every posting with one skill (or '其他') via get_level.
data = data.apply(get_level, axis=1)
data.head(20)

# job_detail_items_key order: Python, SQL, Tableau, Excel, SPSS/SAS.

# Salary and skill for postings that were assigned a skill.
x = data.loc[data.skill != '其他', ['salary', 'skill']]

# One salary Series per skill, in the order of job_detail_items_key.
cond_data = [x.loc[x.skill == skill_name, 'salary'] for skill_name in job_detail_items_key]

# Horizontal box plot comparing the salary distribution of each skill.
plt.figure(figsize=(12, 9))
plt.title(
    label='不同技能的薪资水平对比',
    fontsize=32, weight='bold', color='white',
    backgroundcolor='#c5b783', pad=30
)
_ = plt.boxplot(x=cond_data, vert=False, labels=job_detail_items_key)
plt.tick_params(axis='both', labelsize=16)
plt.grid(axis='x', linewidth=0.75)

# Ticks every 10 from 0 to 60, labelled '0K' ... '60K'.
boxplot_ticks = np.arange(0, 61, 10)
boxplot_tick_labels = [f'{step}K' for step in range(0, 61, 10)]
_ = plt.xticks(boxplot_ticks, boxplot_tick_labels)
plt.xlabel('薪资', fontsize=18)
plt.ylabel('技能', fontsize=18)
# Per-skill totals over the rows whose companySize is '2000人以上'.
big_company_rows = data[data.companySize == '2000人以上']
skill_count = big_company_rows[job_detail_items_key].sum()
skill_count

# Bar chart of those totals; each bar's colour is its total relative to the
# largest one, mapped through RdBu_r.
plt.figure(figsize=(12, 9))
bar_positions = np.arange(5)
bar_colors = plt.cm.RdBu_r(skill_count/skill_count.max())
plt.bar(
    bar_positions,
    skill_count,
    tick_label=job_detail_items_key,
    width=0.5,
    color=bar_colors,
)
big_company_title_style = dict(
    fontsize=32, weight='bold', color='white',
    backgroundcolor='#c5b783', pad=30,
)
_ = plt.title(label='大公司对技能要求', **big_company_title_style)
plt.tick_params(labelsize=16)
plt.grid(axis='y')
plt.box(False)
from matplotlib import gridspec

# Experience label -> numeric level.
workYear_map = {
    '5-10年': 5,
    '3-5年': 4,
    '1-3年': 3,
    '1年以下': 2,
    '应届毕业生': 1
}
# Numeric level -> marker colour.
color_map = {
    5: '#ff0000',
    4: '#ffa500',
    3: '#c5b783',
    2: '#3c7f99',
    1: '#0000cd'
}

# Keep only rows whose workYear is one of the mapped labels. The explicit
# copy makes the result an independent frame, so the column assignments below
# do not write into a slice of the previous frame.
cond = data.workYear.isin(list(workYear_map))
data = data[cond].copy()

# Assign whole columns with data[col] = ... instead of data.loc[:, col] = ....
# On pandas 2.x the .loc form sets values in place and keeps the existing
# object dtype, so companySize would not become categorical and the `.cat`
# accessor below would fail.
data['workYear'] = data.workYear.map(workYear_map)
data.head()

data['companySize'] = data.companySize.astype('category')
companySize_items = ['2000人以上', '500-2000人', '150-500人', '50-150人', '15-50人', '少于15人']
# reorder_categories requires the listed categories to match the existing ones.
data['companySize'] = data.companySize.cat.reorder_categories(companySize_items)

# Sort rows by company size in descending category order.
data.sort_values(by='companySize', inplace=True, ascending=False)
data.head()
# Figure with two panels on a 10-row grid: the salary / company-size scatter
# (rows 0-7) and a one-row legend for the experience levels (row 9).
plt.figure(figsize=(12,11))
gs = gridspec.GridSpec(10, 1)

plt.subplot(gs[:8])
plt.suptitle(
    t='不同规模公司招聘差异',
    fontsize=32,
    weight='bold', color='white', backgroundcolor='#3c7f99'
)
# Each posting is drawn twice: a translucent halo sized by experience level,
# then a default-size dot on top. Both use the colour of the level.
point_colors = data.workYear.map(color_map)
plt.scatter(data.salary, data.companySize,
            c=point_colors,
            s=(data.workYear * 100),
            alpha=0.35
            )
plt.scatter(data.salary, data.companySize,
            c=point_colors,
            )
plt.grid(axis='x')
plt.xticks(np.arange(0, 61, 10), [str(i) + 'K' for i in range(0, 61, 10)])
plt.xlabel('薪资', fontsize=18)
plt.box(False)
plt.tick_params(labelsize=18)

plt.subplot(gs[9:])
# Legend markers, one per experience level in workYear_map order. Sizes and
# colours come from the same maps as the main panel (level * 100 and
# color_map[level]) so the legend matches the plotted points; sizes derived
# from the x positions ran from 400 down to 0 instead of 500 down to 100.
levels = list(workYear_map.values())
x = np.arange(len(levels))[::-1]
y = np.zeros(len(x))
s = np.array(levels) * 100
legend_colors = [color_map[level] for level in levels]
plt.scatter(x, y, s=s, c=legend_colors, alpha=0.35)
plt.scatter(x, y, c=legend_colors)
plt.box(False)
plt.xticks(ticks=x, labels=list(workYear_map.keys()), fontsize=14)
plt.yticks(np.arange(1), labels=['经验'], fontsize=18)