====== Continuous Control ======
===== A2C =====
* https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On
* https://datascience.stackexchange.com/questions/49625/ppo-a2c-for-continuous-action-spaces-math-and-code
class Model(nn.Module):
    """Actor-critic network with separate critic, mean and log-std heads.

    All three heads are independent MLPs over the same input features:
    Linear -> LayerNorm -> ReLU, twice, followed by an output Linear layer.
    The critic uses hidden widths four times larger than the two policy heads.

    Args:
        args: Unused by this class; kept for call-site compatibility.
        n_features: Size of the input feature vector.
        n_actions: Number of action dimensions.
        std: Value whose logarithm is handed to ``init_params`` for the
            output layer of the log-std head.
    """

    def __init__(self, args, n_features, n_actions, std):
        super().__init__()
        # One row of per-dimension action scale factors. The only use in this
        # class is the scaling that std() leaves disabled.
        self.action_scale = torch.FloatTensor([[
            0.20833333, 1.        , 1.        , 1.        , 0.25      ,
            1.        , 1.        , 1.        , 0.12077295, 1.        ,
            1.        , 1.        , 0.15923567, 0.15923567, 1.        ,
            1.        , 1.        , 0.07961783, 1.        , 1.        ,
            1.        , 0.15923567, 0.12077295, 1.        , 1.        ,
            1.        , 0.15923567, 0.15923567, 1.        , 1.        ,
            1.        , 0.10775862, 1.        , 1.        , 1.        ,
            0.15923567
        ]])

        vc = 4  # width multiplier applied to the critic's hidden layers
        self.critic = self._mlp(n_features, vc * 1024, vc * 512, 1, 0.01)
        self.mean = self._mlp(n_features, 1024, 512, n_actions, 0.01)
        self.logstd = self._mlp(n_features, 1024, 512, n_actions, np.log(std))

        # Bounds applied to the raw log-std output in std(). The upper bound
        # is log(1); an alternative of log(2 * std) is currently not used.
        self.min_logstd = np.log(1e-9)
        self.max_logstd = np.log(1)
        self.min_std = 1e-9
        self.max_std = 1

    @staticmethod
    def _mlp(n_in, n_hidden1, n_hidden2, n_out, out_init):
        """Build one head: two Linear/LayerNorm/ReLU stages plus an output layer.

        Layers are created and passed through ``init_params`` strictly in
        network order. ``init_params`` is defined elsewhere; the output layer
        receives the extra arguments ``True, out_init`` (presumably an
        "output layer" flag and an init constant -- confirm against its
        definition).
        """
        layers = []
        for fan_in, fan_out in ((n_in, n_hidden1), (n_hidden1, n_hidden2)):
            layers.append(init_params(nn.Linear(fan_in, fan_out)))
            layers.append(nn.LayerNorm(fan_out))
            layers.append(nn.ReLU())
        layers.append(init_params(nn.Linear(n_hidden2, n_out), True, out_init))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(value, mean, std)`` computed from the features ``x``."""
        value = self.critic(x)
        mean = self.mean(x)
        return value, mean, self.std(x)

    def std(self, x):
        """Return the standard deviation: exp of the clamped log-std head output."""
        bounded = self.logstd(x).clamp(self.min_logstd, self.max_logstd)
        # Scaling by self.action_scale is intentionally not applied here.
        return bounded.exp()
# Fragment of a rollout step (the enclosing method is not shown here).
# Query the behaviour network for the most recent state.
value, mu, sigma = self.model_old(
torch.from_numpy(self.states[-1]).to(self.device_old)
)
# Diagonal covariance matrix with sigma ** 2 on the diagonal.
cov_mat = torch.diag_embed(sigma ** 2).to(self.device_old)
dist = torch.distributions.MultivariateNormal(mu, cov_mat)
action = dist.sample()
# One Bernoulli(self.eps) draw per row: rows with mask == 1 keep the sampled
# action, rows with mask == 0 use the mean mu instead.
mask = np.random.binomial(1, self.eps, action.size(0)).reshape(-1, 1)
action = action.cpu().numpy() * mask + mu.cpu().numpy() * (1 - mask)
# The numpy mix above is converted back to a float32 tensor on mu's device.
action = torch.from_numpy(action).to(mu.device).to(torch.float32)
# Log-probability of the action actually taken (after mixing), as a column.
action_lp = dist.log_prob(action).view(-1, 1)
state, reward, done, info, = self.env.step(action.cpu().numpy())
# Record the score reported in `info` for every entry flagged as done.
for _done, _info in zip(done, info):
    if _done:
        self.score_buffer.append(_info['score'])
# NOTE(review): presumably resets only the finished environments and returns
# the updated batch of states -- confirm against the env wrapper.
state = self.env.reset(state, done)
reward = self.reward_scale * reward
# `mask` is rebound here: 1.0 where the episode continues, 0.0 where it ended.
mask = (1 - done).astype(np.float32)
class Model(nn.Module):
    """Actor-critic network with separate critic, mean and log-variance heads.

    Each head is an independent MLP over the same input features
    (Linear -> LayerNorm -> ReLU, twice, then an output Linear layer).
    ``forward`` returns the state value, the action mean and the per-dimension
    action variance.

    The log-variance head is registered as ``self.logvar`` so that it matches
    the attribute read by ``_var``; registering it under a different name makes
    ``forward`` fail with ``AttributeError``.

    Args:
        args: Unused by this class; kept for call-site compatibility.
        n_features: Size of the input feature vector.
        n_actions: Number of action dimensions.
        var: Value whose logarithm is handed to ``init_params`` for the
            output layer of the log-variance head.
    """

    def __init__(self, args, n_features, n_actions, var):
        super().__init__()
        # One row of per-dimension action scale factors; not used by any
        # method of this class.
        self.action_scale = torch.FloatTensor([[
            0.20833333, 1.        , 1.        , 1.        , 0.25      ,
            1.        , 1.        , 1.        , 0.12077295, 1.        ,
            1.        , 1.        , 0.15923567, 0.15923567, 1.        ,
            1.        , 1.        , 0.07961783, 1.        , 1.        ,
            1.        , 0.15923567, 0.12077295, 1.        , 1.        ,
            1.        , 0.15923567, 0.15923567, 1.        , 1.        ,
            1.        , 0.10775862, 1.        , 1.        , 1.        ,
            0.15923567
        ]])
        vc = 4  # width multiplier applied to the critic's hidden layers
        self.critic = nn.Sequential(
            init_params(nn.Linear(n_features, vc * 1024)),
            nn.LayerNorm(vc * 1024),
            nn.ReLU(),
            init_params(nn.Linear(vc * 1024, vc * 512)),
            nn.LayerNorm(vc * 512),
            nn.ReLU(),
            init_params(nn.Linear(vc * 512, 1), True, 0.01),
        )
        self.mean = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions), True, 0.01),
        )
        # Must be named `logvar`: this is the attribute _var() evaluates.
        self.logvar = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions), True, np.log(var)),
        )
        # Bounds applied to the raw log-variance output in _var().
        self.max_logvar = np.log(1)
        self.min_logvar = np.log(1e-9)
        self.max_var = 1
        self.min_var = 1e-9
        # NOTE(review): this re-initialises every Linear/LayerNorm created
        # above (weights ~ N(0, 0.02), biases zeroed), so whatever
        # init_params() set up -- including the values derived from 0.01 and
        # np.log(var) -- is overwritten. Confirm that this is intended.
        self.apply(self._init_weights)

    def forward(self, x):
        """Return ``(value, mean, variance)`` computed from the features ``x``."""
        return self.critic(x), self.mean(x), self._var(x)

    def _var(self, x):
        """Return the variance: exp of the clamped log-variance head output."""
        logvar = self.logvar(x)
        logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
        return torch.exp(logvar)

    def _init_weights(self, module):
        """Initialise one sub-module; intended to be used with ``self.apply``.

        Linear and Embedding weights are drawn from N(0, 0.02); Linear biases
        are zeroed; LayerNorm is reset to weight 1 and bias 0.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
def sample_action(mu, var):
    """Draw a sample from a diagonal Gaussian with mean ``mu`` and variance ``var``.

    Computes ``mu + eps * sqrt(var)`` with ``eps`` drawn from a standard
    normal distribution.

    Args:
        mu: Tensor of means.
        var: Tensor of variances, broadcastable against ``mu``.

    Returns:
        Tensor of sampled actions.
    """
    import torch

    std = var.sqrt()
    # Draw the noise with the same shape, dtype and device as the standard
    # deviation; a bare torch.randn(size) would always live on the CPU and
    # fail to combine with tensors on another device.
    noise = torch.randn_like(std)
    return mu + noise * std
def cont_logprob(mu, var, actions, min_var=1e-3):
    """Log-probability of ``actions`` under a diagonal Gaussian policy.

    The variance is floored at ``min_var`` once, and the floored value is used
    for both the quadratic term and the normalisation term. Using the floor in
    only one of the two terms yields a quantity that is not a log-density,
    becomes ``+inf`` at ``var == 0`` and keeps rewarding ever smaller
    variances.

    Args:
        mu: Tensor of means.
        var: Tensor of variances, broadcastable against ``mu``.
        actions: Tensor of actions to score, broadcastable against ``mu``.
        min_var: Lower bound applied to ``var`` before it is used.

    Returns:
        Tensor of summed per-dimension log-probabilities; the last dimension
        is kept with size 1.
    """
    import math

    import torch

    safe_var = var.clamp(min=min_var)
    quadratic = -((mu - actions) ** 2) / (2 * safe_var)
    normaliser = -torch.log(torch.sqrt(2 * math.pi * safe_var))
    return (quadratic + normaliser).sum(-1, keepdim=True)
def cont_entropy(var):
    """Entropy of a diagonal Gaussian with per-dimension variance ``var``.

    Each dimension contributes ``(log(2 * pi * var) + 1) / 2``; the
    contributions are summed over the last dimension, which is dropped from
    the result.
    """
    import torch
    import math

    two_pi = 2 * math.pi
    per_dim = torch.log(var * two_pi).add(1).div(2)
    return per_dim.sum(dim=-1)
===== ... =====
- TD3: https://towardsdatascience.com/td3-learning-to-run-with-ai-40dfc512f93
{{tag>RL continuous_control action_space}}