代码都是学习别人的,但我分享几点我踩过的大坑。
1.蒙特卡洛的V值
2.样本不是独立同分布
之后再详述一下。
"""
"""
import torch.nn.functional as F
import torchvision.models as models
import retro
import hiddenlayer as hl
import torch
# import retro
import pandas as pd
import numpy as np
# import mujoco_py
import gym
import time
import PIL.Image as Image
import torch.nn as nn
import os
from warnings import catch_warnings
from torch.distributions import Normal
import warnings
class DQBReplayer:
    """Fixed-capacity ring buffer of transitions backed by a pandas DataFrame.

    Every row is one transition stored under the columns ``observation``,
    ``action``, ``reward``, ``next_observation``, ``done`` and ``step``.
    Once ``capacity`` rows have been written, new transitions overwrite the
    oldest rows.
    """

    _FIELDS = ('observation', 'action', 'reward',
               'next_observation', 'done', 'step')

    def __init__(self, capacity):
        """Pre-allocate storage for ``capacity`` transitions."""
        self.capacity = capacity
        self.memory = self._empty_frame()
        self.i = 0      # row index the next store() call writes to
        self.count = 0  # number of valid rows, never larger than capacity

    def _empty_frame(self):
        """Return a fresh frame with ``capacity`` pre-allocated rows."""
        return pd.DataFrame(index=range(self.capacity),
                            columns=list(self._FIELDS))

    def store(self, *args):
        """Write one transition, given as six positional values in column order."""
        self.memory.loc[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size=32):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns:
            A generator yielding one stacked ``numpy`` array per column, in
            column order, so it can be unpacked into six variables.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        if self.count == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        indices = np.random.choice(self.count, size=size)
        # For each column, select the sampled rows and stack them into one array.
        return (np.stack(self.memory.loc[indices, field])
                for field in self.memory.columns)

    def clear(self):
        """Forget all stored transitions and reset the write position.

        The frame is re-created at full capacity instead of dropping its
        rows, so later store() calls keep writing into pre-allocated rows
        rather than growing the frame one row at a time.
        """
        self.memory = self._empty_frame()
        self.count = 0
        self.i = 0
#
class PolicyNetwork(nn.Module):
    """Gaussian policy network: 3 input features -> (mean, std) of a 1-D action."""

    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(3, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc_mu = nn.Linear(256, 1)
        self.fc_std = nn.Linear(256, 1)
        self.tanh = nn.Tanh()
        self.softplus = nn.Softplus()

    def forward(self, x):
        """Return ``(mu, std)`` of the action distribution for input ``x``.

        ``mu`` is ``2 * tanh(.)`` and therefore lies in [-2, 2]; ``std`` is
        ``softplus(.) + 1e-3`` and therefore strictly positive.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        mu = 2 * self.tanh(self.fc_mu(x))
        std = self.softplus(self.fc_std(x)) + 1e-3
        return mu, std

    def select_action(self, state):
        """Sample one action for ``state`` and clip it to [-2, 2].

        Args:
            state: a single observation accepted by ``torch.Tensor(...)``;
                ``.item()`` below requires exactly one sampled value, so a
                batch of observations is not supported.

        Returns:
            The sampled action as a scalar clipped to [-2, 2].
        """
        # Use the device this module lives on rather than a module-level
        # `device` global, which is only bound when the file runs as a script.
        own_device = next(self.parameters()).device
        state = torch.Tensor(state).to(own_device)
        with torch.no_grad():
            mu, std = self.forward(state)
            action = Normal(mu, std).sample()
        return np.clip(action.item(), -2., 2.)
class ValueNetwork(nn.Module):
    """State-value network: 3 -> 64 -> 256 -> 1 with ReLU after the first two layers."""

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept: they determine the
        # state_dict keys and the order in which parameters are initialised.
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(3, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension of size 1)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        # No activation on the output layer.
        return self.fc3(hidden)
class PPO(nn.Module):
    """Bundle of replay buffer, policy/value networks and hyper-parameters.

    The class defines no ``forward``; it only groups the pieces the
    training script accesses as attributes.
    """

    def __init__(self):
        super().__init__()

        # Scalar settings and counters.
        self.gamma = 0.99
        self.clip_ratio = 0.0005
        self.entropy_loss_ratio = 0.05
        self.learn_step = 0

        # Transition storage.
        self.replayer = DQBReplayer(capacity=1000)

        # Sub-networks, created in the order policy -> old_policy -> value.
        # `device` is read from module scope, so it must be bound before
        # an instance is constructed.
        self.policy = PolicyNetwork().to(device)
        self.old_policy = PolicyNetwork().to(device)
        self.value = ValueNetwork().to(device)

        # hiddenlayer objects kept for plotting/logging.
        self.canvasl = hl.Canvas()
        self.history = hl.History()
if __name__ == "__main__":
    # Script-level bookkeeping values; none of them is read by the code below.
    max_score = 12
    batch_list = []
    last_r_s, last_step = 0, 0
    store_count = 0

    # PPO.__init__ reads `device` as a module-level global, so it has to be
    # bound here before PPO() is constructed.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    env = gym.make("Pendulum-v0")
    # NOTE(review): the result of this attribute access is discarded, so the
    # line has no effect on `env`; kept as in the original.
    env.unwrapped

    net = PPO().to(device)
    mse = nn.MSELoss()  # not used below; the value loss uses F.mse_loss
    optim = torch.optim.Adam(net.policy.parameters(), lr=2e-5)
    value_optim = torch.optim.Adam(net.value.parameters(), lr=4e-5)

    for i in range(200000):
        state = env.reset()
        epoch_reward = 0  # cumulative (rescaled) reward of the current episode

        # ---- Rollout: collect up to 200 transitions with the current policy.
        for step in range(200):
            # env.render()
            action = net.policy.select_action(state)
            next_state, r, done, info = env.step([action])
            reward = (r + 8.1) / 8.1  # rescale the raw reward before storing
            epoch_reward += reward
            net.replayer.store(state, action, reward, next_state, done, step)
            net.learn_step += 1
            state = next_state
            if done or step > 10000:
                last_r_s, last_step = epoch_reward, step
                print("epsiode:{}---avg_reward:{:.2f}---sum_step: {}".format(i, epoch_reward, step))

        # ---- Update: 10 mini-batch steps on the transitions just collected.
        # NOTE(review): the original indentation was lost; the update is
        # placed once per episode, after the rollout loop -- confirm.
        net.old_policy.load_state_dict(net.policy.state_dict())
        for _ in range(10):
            states, actions, rewards, next_states, _dones, _steps = net.replayer.sample(32)
            next_states = torch.Tensor(next_states).to(device)
            rewards = torch.Tensor(rewards).unsqueeze(1).to(device)
            states = torch.Tensor(states).to(device)
            # Fix: give actions a trailing dimension, as is done for rewards.
            # The policy outputs have shape (batch, 1); without unsqueeze the
            # (batch,) actions broadcast against them to (batch, batch), so
            # every state's advantage would be multiplied by ratios computed
            # for the actions of all other samples.
            actions = torch.Tensor(actions).unsqueeze(1).to(device)

            # Targets and old-policy terms are constants for both losses,
            # so they are computed without building a graph.
            with torch.no_grad():
                old_mu, old_std = net.old_policy(states)
                old_n = Normal(old_mu, old_std)
                old_log_prob = old_n.log_prob(actions)
                # The sampled `done` flags are not used to mask the bootstrap term.
                value_target = rewards + net.gamma * net.value(next_states)
                advantage = value_target - net.value(states)

            # Clipped surrogate objective for the policy.
            mu, std = net.policy(states)
            n = Normal(mu, std)
            log_prob = n.log_prob(actions)
            ratio = torch.exp(log_prob - old_log_prob)
            L1 = ratio * advantage
            L2 = torch.clamp(ratio, 0.8, 1.2) * advantage
            loss = -torch.min(L1, L2).mean()
            optim.zero_grad()
            loss.backward()
            optim.step()

            # Regress the value network towards the one-step target.
            value_loss = F.mse_loss(value_target, net.value(states))
            value_optim.zero_grad()
            value_loss.backward()
            value_optim.step()

        # Drop this episode's transitions before the next rollout.
        net.replayer.clear()