# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
print(pd.__version__)

import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
print(mpl.__version__)

在这里插入图片描述

1. 创建和加载数据

1.1 根据python内置list/dict生成

# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)

import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)

# Build a one-dimensional labelled sequence (Series) from plain Python lists
labels = list('abcde')
s = Series(data=[1, 2, 3, 4, 5], index=labels)
print(s)

# Is 'b' one of the index labels, and what value is stored under it?
print(r"'b' in s?")
print('b' in s.index)
print(s.loc['b'])

# Round trip: Series -> dict -> Series
print("Series.to_dict() can convert Series to dict")
mapping = s.to_dict()
print(mapping)

print("Series(dict) can convert dict to Series")
print(Series(mapping))

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-eKV3iS94-1586606244035)(assets/2020-03-29-18-12-01.png)]

1.2 从网络中获取

# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)

import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader

import matplotlib.pyplot as plt
import matplotlib

import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)


# Time window for the download
start = datetime.datetime(year=2010, month=1, day=1)
end = datetime.datetime(year=2016, month=5, day=20)

# Request the "000001.SS" history from the 'yahoo' source and show the first three rows
sh = data.DataReader("000001.SS", data_source='yahoo', start=start, end=end)
print(sh.head(n=3))

# Save the downloaded pandas DataFrame to a local .csv file
sh.to_csv('sh.csv')

在这里插入图片描述

1.3 从文件（.csv/.xlsx 等）中读取数据

# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)

import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader

import matplotlib.pyplot as plt
import matplotlib

import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)


# Load the prices stored in sh.csv, using the 'Date' column as a parsed datetime index
df = pd.read_csv('sh.csv', parse_dates=True, index_col='Date')
print(df)

2. Series 与 DataFrame 的基础操作

2.1 索引、切片与创建新列

# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)

import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader

import matplotlib.pyplot as plt
import matplotlib

import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)


# Reload the prices stored in sh.csv, indexed by the parsed 'Date' column
df = pd.read_csv('sh.csv', parse_dates=True, index_col='Date')

# Keep the last ten rows of the 'Close' column and show both the data and its type
closes = df['Close']
ts = closes.iloc[-10:]
print(ts)
print(type(ts))

在这里插入图片描述

在IPython中继续执行：

# Take the index label stored at position 5 of ts (shown below as a Timestamp)
date = ts.index[5]
date

输出：

Timestamp('2016-05-16 00:00:00')

输入：

# Label-based lookup: fetch the value stored under the `date` index label
ts.loc[date]

输出：

2850.862060546875

输入：

# Positional lookup: fetch the sixth element of ts.
# ts is indexed by dates, so a bare integer key (ts[5]) depends on pandas'
# deprecated positional fallback; .iloc states the positional intent explicitly.
ts.iloc[5]

输出：

2850.862060546875

输入：

# Select two columns by name and preview the first rows
df.loc[:, ['Open', 'Close']].head()

输出：

                   Open        Close
Date                                
2010-01-04  3289.750000  3243.760010
2010-01-05  3254.468018  3282.178955
2010-01-06  3277.517090  3254.215088
2010-01-07  3253.990967  3192.775879
2010-01-08  3177.259033  3195.997070

输入：

# Add a derived column: opening price minus closing price, row by row
df['diff'] = df['Open'] - df['Close']
df.head()

输出：

                   High          Low  ...    Adj Close       diff
Date                                  ...                        
2010-01-04  3295.279053  3243.319092  ...  3243.760010  45.989990
2010-01-05  3290.511963  3221.461914  ...  3282.178955 -27.710938
2010-01-06  3295.867920  3253.043945  ...  3254.215088  23.302002
2010-01-07  3268.819092  3176.707031  ...  3192.775879  61.215088
2010-01-08  3198.919922  3149.017090  ...  3195.997070 -18.738037

[5 rows x 7 columns]

输入：

# Remove the temporary 'diff' column from df in place
df.drop(columns='diff', inplace=True)
df.head()

输出：

                   High          Low  ...  Volume    Adj Close
Date                                  ...                     
2010-01-04  3295.279053  3243.319092  ...  109400  3243.760010
2010-01-05  3290.511963  3221.461914  ...  126200  3282.178955
2010-01-06  3295.867920  3253.043945  ...  123600  3254.215088
2010-01-07  3268.819092  3176.707031  ...  128600  3192.775879
2010-01-08  3198.919922  3149.017090  ...   98400  3195.997070

[5 rows x 6 columns]

3. 常规的金融计算

3.1 移动平均

close_px = df['Adj Close']
# NOTE: with the pandas version used here this call raises AttributeError
# (module 'pandas' has no attribute 'rolling_mean'); see the traceback below.
mavg = pd.rolling_mean(close_px, 40)
mavg[-10:]

输出（新版 pandas 已移除 pd.rolling_mean，因此报错；应改用下文的 Series.rolling(...).mean()）：


  File "<ipython-input-14-9dd3f0f7e3fa>", line 2, in <module>
    mavg = pd.rolling_mean(close_px, 40)

  File "C:\Python\lib\site-packages\pandas\__init__.py", line 262, in __getattr__
    raise AttributeError(f"module 'pandas' has no attribute '{name}'")

AttributeError: module 'pandas' has no attribute 'rolling_mean'

# Rolling mean of the adjusted close over a 40-row window.
# The first 39 entries are NaN because the window is not yet full.
close_px = df['Adj Close']
window = close_px.rolling(window=40)
mavg = window.mean()
mavg

输出：

Date
2010-01-04            NaN
2010-01-05            NaN
2010-01-06            NaN
2010-01-07            NaN
2010-01-08            NaN
    
2016-05-16    2970.439978
2016-05-17    2967.653333
2016-05-18    2962.371130
2016-05-19    2957.559705
2016-05-20    2952.947778
Name: Adj Close, Length: 1550, dtype: float64

3.2 收益

输入：

# Simple return: each price divided by the previous row's price, minus one.
# The first row has no predecessor, so it is NaN.
previous_px = close_px.shift(periods=1)
rets = close_px / previous_px - 1
# Alternative spelling: rets = close_px.pct_change()
rets.head()

输出：

Date
2010-01-04         NaN
2010-01-05    0.011844
2010-01-06   -0.008520
2010-01-07   -0.018880
2010-01-08    0.001009
Name: Adj Close, dtype: float64

4. 绘图基础

# Plot the adjusted close together with its rolling mean on the same axes.
# close_px comes from sh.csv, i.e. the "000001.SS" download, so the legend
# entry names that symbol (the previous 'AAPL' label did not match the data).
close_px.plot(label='000001.SS')
mavg.plot(label='mavg')
plt.legend()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VS5moCnW-1586606244038)(assets/2020-03-29-20-40-41.png)]

扩展…

# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)

import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader

import matplotlib.pyplot as plt
import matplotlib

import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)


# Time window for the download
start = datetime.datetime(year=2010, month=1, day=1)
end = datetime.datetime(year=2016, month=5, day=20)

# Fetch several tickers in one request and keep only the 'Adj Close' table
symbols = ['AAPL', 'GE', 'GOOG', 'IBM', 'KO', 'MSFT', 'PEP']
quotes = data.DataReader(symbols, 'yahoo', start, end)
sh = quotes['Adj Close']
print(sh.head(3))  # first three rows

Symbols          AAPL         GE        GOOG  ...         KO       MSFT        PEP
Date                                          ...                                 
2009-12-31  26.131752  10.526512  308.832428  ...  19.278732  23.925440  44.622261
2010-01-04  26.538483  10.749147  312.204773  ...  19.292267  24.294369  44.945187
2010-01-05  26.584366  10.804806  310.829926  ...  19.058893  24.302216  45.488274

[3 rows x 7 columns]

输入：

# Row-over-row percentage returns of the multi-ticker 'Adj Close' table.
# That table is bound to `sh` by the script above; `df` still holds the
# sh.csv data, which has no PEP or KO columns.
rets = sh.pct_change()

# Compare the returns of two tickers against each other
plt.scatter(rets.PEP, rets.KO)
plt.xlabel('Returns PEP')
plt.ylabel('Returns KO')

在这里插入图片描述

# NOTE: with the pandas version used here this call raises AttributeError
# (module 'pandas' has no attribute 'scatter_matrix'); the working spelling
# is pd.plotting.scatter_matrix, shown further below.
pd.scatter_matrix(rets, diagonal='kde', figsize=(10, 10));

输出（新版 pandas 中 scatter_matrix 已移至 pd.plotting，因此报错）：


  File "C:\Python\lib\site-packages\pandas\__init__.py", line 262, in __getattr__
    raise AttributeError(f"module 'pandas' has no attribute '{name}'")

AttributeError: module 'pandas' has no attribute 'scatter_matrix'

重新输入：

# Pairwise scatter plots of all return columns, with a KDE curve on the diagonal.
# The trailing semicolon keeps IPython from echoing the returned value.
pd.plotting.scatter_matrix(
    rets,
    figsize=(10, 10),
    diagonal='kde',
);

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-aAdqCmPm-1586606244040)(assets/2020-03-29-21-18-44.png)]

输入：

# Pairwise correlation between the return columns (Pearson is the pandas default)
corr = rets.corr(method='pearson')
corr

输出：

Symbols      AAPL        GE      GOOG       IBM        KO      MSFT       PEP
Symbols                                                                      
AAPL     1.000000  0.387574  0.406971  0.387261  0.298461  0.393892  0.273217
GE       0.387574  1.000000  0.423675  0.532942  0.491217  0.478202  0.485198
GOOG     0.406971  0.423675  1.000000  0.402424  0.329096  0.463922  0.322701
IBM      0.387261  0.532942  0.402424  1.000000  0.449300  0.495341  0.412432
KO       0.298461  0.491217  0.329096  0.449300  1.000000  0.402174  0.643624
MSFT     0.393892  0.478202  0.463922  0.495341  0.402174  1.000000  0.414073
PEP      0.273217  0.485198  0.322701  0.412432  0.643624  0.414073  1.000000

输入：

# Draw the correlation matrix as a heat map and label both axes with the column names
tick_positions = range(len(corr))
tick_labels = corr.columns
plt.imshow(corr, interpolation='none', cmap='hot')
plt.colorbar()
plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels);

在这里插入图片描述

我们经常关心的一个问题是：预期回报（通常取回报率的均值）与所承担的风险（通常用回报率的方差或标准差衡量，下面的代码使用标准差）之间的关系。二者之间往往需要权衡。
这里我们使用 plt.annotate 在散点图上为每个点标注标签。

# Scatter each column's mean return against its standard deviation,
# then tag every point with the column name.
expected = rets.mean()
risk = rets.std()

plt.scatter(expected, risk)
plt.xlabel('Expected returns')
plt.ylabel('Risk')

for name, mean_ret, std_ret in zip(rets.columns, expected, risk):
    plt.annotate(
        name,
        xy=(mean_ret, std_ret),
        xytext=(20, -20),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox={'boxstyle': 'round,pad=0.5', 'fc': 'yellow', 'alpha': 0.5},
        arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
    )

在这里插入图片描述

5. 数据对齐与 NaN 处理

程序：

# -*- coding: utf-8 -*-
"""
Created on Sun Mar 29 17:58:13 2020

@author: CHERN
"""

import datetime
import pandas as pd
from pandas import Series, DataFrame
# print(pd.__version__)

import numpy as np
from pandas_datareader import data, wb # 需要安装 pip install pandas_datareader

import matplotlib.pyplot as plt
import matplotlib

import matplotlib as mpl
mpl.rc('figure', figsize=(8, 7))
# print(mpl.__version__)


# Download the 'Adj Close' column of each security over the same window,
# name each Series after its ticker, and line them up side by side in one DataFrame.
securities = ['AAPL', 'GOOG', 'IBM', 'MSFT']
window_start = datetime.datetime(2011, 10, 1)
window_end = datetime.datetime(2013, 1, 1)

series_list = []
for security in securities:
    quotes = data.DataReader(security, 'yahoo', start=window_start, end=window_end)
    # Rename the series so the resulting column carries the security name
    s = quotes['Adj Close'].rename(security)
    series_list.append(s)

df = pd.concat(series_list, axis=1)
print(df.head())

                 AAPL        GOOG         IBM       MSFT
Date                                                    
2011-09-30  47.285904  256.558350  130.822800  20.321293
2011-10-03  46.452591  246.834808  129.640747  20.027370
2011-10-04  46.192177  250.012894  130.725555  20.688694
2011-10-05  46.905209  251.407669  132.304108  21.137737
2011-10-06  46.796078  256.393982  135.924973  21.505136

输入：

# NOTE: with the pandas version used here DataFrame.ix is unavailable, so these
# assignments raise AttributeError ('DataFrame' object has no attribute 'ix').
# Intent: set row 0 of AAPL, row 1 of GOOG/IBM and rows 1-3 of MSFT to NaN.
df.ix[0, 'AAPL'] = np.nan
df.ix[1, ['GOOG', 'IBM']] = np.nan
df.ix[[1, 2, 3], 'MSFT'] = np.nan

df.head()

输出：

AttributeError: 'DataFrame' object has no attribute 'ix'

输入：

# Blank out a few cells by row position: translate the positions into index
# labels through df.index, then assign via the label-based .loc accessor.
df.loc[df.index[0], 'AAPL'] = np.nan
df.loc[df.index[1], ['GOOG', 'IBM']] = np.nan
# Rows 1, 2 and 3: the slice end is exclusive, so 1:4 is needed
# (1:3 would leave row 3 untouched, unlike the output shown below).
df.loc[df.index[1:4], 'MSFT'] = np.nan

df.head()

输出：

                 AAPL        GOOG         IBM       MSFT
Date                                                    
2011-09-30        NaN  256.558350  130.822800  20.321293
2011-10-03  46.452591         NaN         NaN        NaN
2011-10-04  46.192177  250.012894  130.725555        NaN
2011-10-05  46.905209  251.407669  132.304108        NaN
2011-10-06  46.796078  256.393982  135.924973  21.505136

输入：

# Element-wise sum of two columns; a NaN in either operand yields NaN in the result
df['AAPL'].add(df['GOOG']).head()

输出：

Date
2011-09-30           NaN
2011-10-03           NaN
2011-10-04    296.205070
2011-10-05    298.312878
2011-10-06    303.190060
dtype: float64

输入：

# Forward-fill: each NaN takes the last earlier non-NaN value in its column.
# A NaN in the very first row has nothing before it and stays NaN (see AAPL below).
df.ffill().head()

输出：

                 AAPL        GOOG         IBM       MSFT
Date                                                    
2011-09-30        NaN  256.558350  130.822800  20.321293
2011-10-03  46.452591  256.558350  130.822800  20.321293
2011-10-04  46.192177  250.012894  130.725555  20.321293
2011-10-05  46.905209  251.407669  132.304108  20.321293
2011-10-06  46.796078  256.393982  135.924973  21.505136

NaN

2011-10-04 296.205070
2011-10-05 298.312878
2011-10-06 303.190060
dtype: float64


输入：
```python
df.ffill().head()

输出：

                 AAPL        GOOG         IBM       MSFT
Date                                                    
2011-09-30        NaN  256.558350  130.822800  20.321293
2011-10-03  46.452591  256.558350  130.822800  20.321293
2011-10-04  46.192177  250.012894  130.725555  20.321293
2011-10-05  46.905209  251.407669  132.304108  20.321293
2011-10-06  46.796078  256.393982  135.924973  21.505136