# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
print(pd.__version__)
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
print(mpl.__version__)
1. 创建和加载数据
1.1 根据python内置list/dict生成
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Build a labelled one-dimensional Series from plain Python lists.
labels = list('abcde')
s = Series(data=[1, 2, 3, 4, 5], index=labels)
print(s)
# Membership tests look at the index labels; [] looks a value up by label.
print(r"'b' in s?", 'b' in s, s['b'], sep='\n')
# Round trip: Series -> dict -> Series.
mapping = s.to_dict()
print("Series.to_dict() can convert Series to dict", mapping, sep='\n')
print("Series(dict) can convert dict to Series", Series(mapping), sep='\n')
![运行结果](assets/2020-03-29-18-12-01.png)
1.2 从网络中获取
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Date range of the quotes to download.
start = datetime.datetime(year=2010, month=1, day=1)
end = datetime.datetime(year=2016, month=5, day=20)
# Request ticker 000001.SS from the 'yahoo' data source.
sh = data.DataReader("000001.SS", data_source='yahoo', start=start, end=end)
print(sh.head(n=3))  # show the first three rows
# Save the downloaded pandas DataFrame to a .csv file.
sh.to_csv('sh.csv')
1.3 从文件(.csv/.xlsx 等)中读取数据
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Load the quotes saved in sh.csv, using the parsed 'Date' column as the index.
df = pd.read_csv('sh.csv', index_col='Date', parse_dates=True)
print(df)
2. Series and DataFrame: (基础操作)
2.1 索引切片创建新列
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Reload the saved quotes with the parsed 'Date' column as the index.
df = pd.read_csv('sh.csv', parse_dates=True, index_col='Date')
# Keep the last ten closing prices; a single column is a pandas Series.
ts = df['Close'].iloc[-10:]
print(ts)
print(type(ts))
在IPython中继续执行:
date = ts.index[5]
date
输出:
Timestamp('2016-05-16 00:00:00')
输入:
ts[date]
输出:
2850.862060546875
输入:
ts.iloc[5]  # explicit positional lookup; integer keys via ts[5] on a date-indexed Series are deprecated
输出:
2850.862060546875
输入:
df[['Open', 'Close']].head()
输出:
Open Close
Date
2010-01-04 3289.750000 3243.760010
2010-01-05 3254.468018 3282.178955
2010-01-06 3277.517090 3254.215088
2010-01-07 3253.990967 3192.775879
2010-01-08 3177.259033 3195.997070
输入:
df['diff'] = df.Open - df.Close
df.head()
输出:
High Low ... Adj Close diff
Date ...
2010-01-04 3295.279053 3243.319092 ... 3243.760010 45.989990
2010-01-05 3290.511963 3221.461914 ... 3282.178955 -27.710938
2010-01-06 3295.867920 3253.043945 ... 3254.215088 23.302002
2010-01-07 3268.819092 3176.707031 ... 3192.775879 61.215088
2010-01-08 3198.919922 3149.017090 ... 3195.997070 -18.738037
[5 rows x 7 columns]
输入:
del df['diff']
df.head()
输出:
High Low ... Volume Adj Close
Date ...
2010-01-04 3295.279053 3243.319092 ... 109400 3243.760010
2010-01-05 3290.511963 3221.461914 ... 126200 3282.178955
2010-01-06 3295.867920 3253.043945 ... 123600 3254.215088
2010-01-07 3268.819092 3176.707031 ... 128600 3192.775879
2010-01-08 3198.919922 3149.017090 ... 98400 3195.997070
[5 rows x 6 columns]
3. 常规的金融计算
3.1 移动平均
close_px = df['Adj Close']
mavg = pd.rolling_mean(close_px, 40)
mavg[-10:]
输出:
File "<ipython-input-14-9dd3f0f7e3fa>", line 2, in <module>
mavg = pd.rolling_mean(close_px, 40)
File "C:\Python\lib\site-packages\pandas\__init__.py", line 262, in __getattr__
raise AttributeError(f"module 'pandas' has no attribute '{name}'")
AttributeError: module 'pandas' has no attribute 'rolling_mean'
close_px = df['Adj Close']
mavg = close_px.rolling(40).mean()
mavg
输出:
Date
2010-01-04 NaN
2010-01-05 NaN
2010-01-06 NaN
2010-01-07 NaN
2010-01-08 NaN
2016-05-16 2970.439978
2016-05-17 2967.653333
2016-05-18 2962.371130
2016-05-19 2957.559705
2016-05-20 2952.947778
Name: Adj Close, Length: 1550, dtype: float64
3.2 收益
输入:
rets = close_px / close_px.shift(1) - 1
# rets = close_px.pct_change()
rets.head()
输出:
Date
2010-01-04 NaN
2010-01-05 0.011844
2010-01-06 -0.008520
2010-01-07 -0.018880
2010-01-08 0.001009
Name: Adj Close, dtype: float64
4. 绘图基础
# close_px is the 'Adj Close' column loaded from sh.csv (ticker 000001.SS),
# so the legend must not call it AAPL.
close_px.plot(label='000001.SS')
mavg.plot(label='mavg')
plt.legend()
![收盘价与移动平均线](assets/2020-03-29-20-40-41.png)
扩展…
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# 定义获取数据的时间段
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2016,5,20)
sh = data.DataReader(['AAPL','GE','GOOG','IBM','KO', 'MSFT', 'PEP'],'yahoo', start, end)['Adj Close']
print(sh.head(3)) # 输出数据前三行
Symbols AAPL GE GOOG ... KO MSFT PEP
Date ...
2009-12-31 26.131752 10.526512 308.832428 ... 19.278732 23.925440 44.622261
2010-01-04 26.538483 10.749147 312.204773 ... 19.292267 24.294369 44.945187
2010-01-05 26.584366 10.804806 310.829926 ... 19.058893 24.302216 45.488274
[3 rows x 7 columns]
输入:
# The multi-ticker 'Adj Close' table was stored in `sh`; `df` still holds the
# single-index quotes from sh.csv and has no PEP/KO columns.
rets = sh.pct_change()
plt.scatter(rets.PEP, rets.KO)
plt.xlabel('Returns PEP')
plt.ylabel('Returns KO')
pd.scatter_matrix(rets, diagonal='kde', figsize=(10, 10));
输出:
File "C:\Python\lib\site-packages\pandas\__init__.py", line 262, in __getattr__
raise AttributeError(f"module 'pandas' has no attribute '{name}'")
AttributeError: module 'pandas' has no attribute 'scatter_matrix'
重新输入:
pd.plotting.scatter_matrix(rets, diagonal='kde', figsize=(10, 10));
![收益率散点矩阵图](assets/2020-03-29-21-18-44.png)
输入:
corr = rets.corr()
corr
输出:
Symbols AAPL GE GOOG IBM KO MSFT PEP
Symbols
AAPL 1.000000 0.387574 0.406971 0.387261 0.298461 0.393892 0.273217
GE 0.387574 1.000000 0.423675 0.532942 0.491217 0.478202 0.485198
GOOG 0.406971 0.423675 1.000000 0.402424 0.329096 0.463922 0.322701
IBM 0.387261 0.532942 0.402424 1.000000 0.449300 0.495341 0.412432
KO 0.298461 0.491217 0.329096 0.449300 1.000000 0.402174 0.643624
MSFT 0.393892 0.478202 0.463922 0.495341 0.402174 1.000000 0.414073
PEP 0.273217 0.485198 0.322701 0.412432 0.643624 0.414073 1.000000
输入:
plt.imshow(corr, cmap='hot', interpolation='none')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns);
我们经常感兴趣的一件事是预期回报(通常是回报率的均值)与我们承担的风险之间(回报率的方差)的关系。这两者之间往往存在一种权衡。
这里我们使用 plt.annotate 在散点图上标注标签。
# Scatter each column's mean return against the standard deviation of its returns.
expected = rets.mean()
risk = rets.std()
plt.scatter(expected, risk)
plt.xlabel('Expected returns')
plt.ylabel('Risk')
# Label every point with its column name; the loop body must be indented.
for label, x, y in zip(rets.columns, expected, risk):
    plt.annotate(
        label,
        xy=(x, y), xytext=(20, -20),
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
5. 数据对齐与NaN处理
程序:
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020
@author: CHERN
"""
import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)
import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)
# Download the 'Adj Close' series of each security and combine them into one
# DataFrame with one column per security.
series_list = []
securities = ['AAPL', 'GOOG', 'IBM', 'MSFT']
for security in securities:
    # The loop body must be indented, otherwise the snippet is a syntax error.
    s = data.DataReader(security, 'yahoo',
                        start=datetime.datetime(2011, 10, 1),
                        end=datetime.datetime(2013, 1, 1))['Adj Close']
    s.name = security  # Rename series to match security name
    series_list.append(s)
# axis=1 places each series in its own column.
df = pd.concat(series_list, axis=1)
print(df.head())
AAPL GOOG IBM MSFT
Date
2011-09-30 47.285904 256.558350 130.822800 20.321293
2011-10-03 46.452591 246.834808 129.640747 20.027370
2011-10-04 46.192177 250.012894 130.725555 20.688694
2011-10-05 46.905209 251.407669 132.304108 21.137737
2011-10-06 46.796078 256.393982 135.924973 21.505136
输入:
df.ix[0, 'AAPL'] = np.nan
df.ix[1, ['GOOG', 'IBM']] = np.nan
df.ix[[1, 2, 3], 'MSFT'] = np.nan
df.head()
输出:
AttributeError: 'DataFrame' object has no attribute 'ix'
输入:
# .loc with labels taken from df.index replaces the removed .ix accessor.
df.loc[df.index[0], ['AAPL']] = np.nan
df.loc[df.index[1], ['GOOG', 'IBM']] = np.nan
# Rows 1, 2 and 3 (as in the .ix version); the slice end is exclusive, so use 4.
df.loc[df.index[1:4], 'MSFT'] = np.nan
df.head()
输出:
AAPL GOOG IBM MSFT
Date
2011-09-30 NaN 256.558350 130.822800 20.321293
2011-10-03 46.452591 NaN NaN NaN
2011-10-04 46.192177 250.012894 130.725555 NaN
2011-10-05 46.905209 251.407669 132.304108 NaN
2011-10-06 46.796078 256.393982 135.924973 21.505136
输入:
(df.AAPL + df.GOOG).head()
输出:
Date
2011-09-30 NaN
2011-10-03 NaN
2011-10-04 296.205070
2011-10-05 298.312878
2011-10-06 303.190060
dtype: float64
输入:
df.ffill().head()
输出:
AAPL GOOG IBM MSFT
Date
2011-09-30 NaN 256.558350 130.822800 20.321293
2011-10-03 46.452591 256.558350 130.822800 20.321293
2011-10-04 46.192177 250.012894 130.725555 20.321293
2011-10-05 46.905209 251.407669 132.304108 20.321293
2011-10-06 46.796078 256.393982 135.924973 21.505136
NaN
2011-10-04 296.205070
2011-10-05 298.312878
2011-10-06 303.190060
dtype: float64
输入:
```python
df.ffill().head()
```
输出:
AAPL GOOG IBM MSFT
Date
2011-09-30 NaN 256.558350 130.822800 20.321293
2011-10-03 46.452591 256.558350 130.822800 20.321293
2011-10-04 46.192177 250.012894 130.725555 20.321293
2011-10-05 46.905209 251.407669 132.304108 20.321293
2011-10-06 46.796078 256.393982 135.924973 21.505136