Continuous Control
A2C
Actor-critic model for continuous actions. The policy is a diagonal Gaussian: one head predicts the mean, a second head predicts the (clamped) log standard deviation, and the critic is a wider value network.

import numpy as np
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, args, n_features, n_actions, std):
        super().__init__()
        # per-dimension action scaling (kept for reference; currently unused in std())
        self.action_scale = torch.FloatTensor([[
            0.20833333, 1., 1., 1., 0.25, 1., 1., 1., 0.12077295, 1., 1., 1.,
            0.15923567, 0.15923567, 1., 1., 1., 0.07961783, 1., 1., 1.,
            0.15923567, 0.12077295, 1., 1., 1., 0.15923567, 0.15923567, 1., 1., 1.,
            0.10775862, 1., 1., 1., 0.15923567
        ]])
        vc = 4  # width multiplier for the critic
        # init_params: weight-initialisation helper defined elsewhere
        self.critic = nn.Sequential(
            init_params(nn.Linear(n_features, vc * 1024)),
            nn.LayerNorm(vc * 1024),
            nn.ReLU(),
            init_params(nn.Linear(vc * 1024, vc * 512)),
            nn.LayerNorm(vc * 512),
            nn.ReLU(),
            init_params(nn.Linear(vc * 512, 1), True, 0.01),
        )
        self.mean = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions), True, 0.01),
        )
        self.logstd = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions), True, np.log(std)),
        )
        self.max_logstd = np.log(1)     # np.log(2 * std)
        self.min_logstd = np.log(1e-9)
        self.max_std = 1                # 2 * std
        self.min_std = 1e-9

    def forward(self, x):
        return self.critic(x), self.mean(x), self.std(x)

    def std(self, x):
        logstd = self.logstd(x)
        logstd = torch.clamp(logstd, self.min_logstd, self.max_logstd)
        std = torch.exp(logstd)  # * self.action_scale.to(x.device)
        return std

Rollout step: build a multivariate Gaussian from the predicted mean and variance, sample an action, and step the vectorised environment. With probability eps the sampled action is kept; otherwise the deterministic mean is used.

value, mu, sigma = self.model_old(
    torch.from_numpy(self.states[-1]).to(self.device_old)
)
cov_mat = torch.diag_embed(sigma ** 2).to(self.device_old)
dist = torch.distributions.MultivariateNormal(mu, cov_mat)
action = dist.sample()

# exploration switch: mask == 1 keeps the sampled action, mask == 0 uses the mean
mask = np.random.binomial(1, self.eps, action.size(0)).reshape(-1, 1)
action = action.cpu().numpy() * mask + mu.cpu().numpy() * (1 - mask)
action = torch.from_numpy(action).to(mu.device).to(torch.float32)
action_lp = dist.log_prob(action).view(-1, 1)

state, reward, done, info = self.env.step(action.cpu().numpy())
for _done, _info in zip(done, info):
    if _done:
        self.score_buffer.append(_info['score'])
state = self.env.reset(state, done)      # vectorised env: reset only finished workers
reward = self.reward_scale * reward
mask = (1 - done).astype(np.float32)     # 0 where an episode terminated
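For reference, a minimal sketch of how the value, log-probability and entropy collected during the rollout above could be combined into the A2C objective. The function name, tensor names (values, returns, log_probs, entropy) and the coefficients are assumptions for illustration, not taken from the page.

import torch
import torch.nn.functional as F

def a2c_losses(values, returns, log_probs, entropy,
               value_coef=0.5, entropy_coef=0.01):
    # advantage = n-step return minus the critic's value estimate
    advantage = returns - values
    # policy-gradient term: raise log-prob of actions with positive advantage
    actor_loss = -(log_probs * advantage.detach()).mean()
    # critic regression toward the empirical returns
    critic_loss = F.mse_loss(values, returns)
    # entropy bonus keeps the Gaussian policy from collapsing too early
    return actor_loss + value_coef * critic_loss - entropy_coef * entropy.mean()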
A variant that parameterises the policy with a variance head instead of a log-std head, together with standalone helpers for sampling, log-probability and entropy of a diagonal Gaussian.

import math

import numpy as np
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, args, n_features, n_actions, var):
        super().__init__()
        self.action_scale = torch.FloatTensor([[
            0.20833333, 1., 1., 1., 0.25, 1., 1., 1., 0.12077295, 1., 1., 1.,
            0.15923567, 0.15923567, 1., 1., 1., 0.07961783, 1., 1., 1.,
            0.15923567, 0.12077295, 1., 1., 1., 0.15923567, 0.15923567, 1., 1., 1.,
            0.10775862, 1., 1., 1., 0.15923567
        ]])
        vc = 4
        self.critic = nn.Sequential(
            init_params(nn.Linear(n_features, vc * 1024)),
            nn.LayerNorm(vc * 1024),
            nn.ReLU(),
            init_params(nn.Linear(vc * 1024, vc * 512)),
            nn.LayerNorm(vc * 512),
            nn.ReLU(),
            init_params(nn.Linear(vc * 512, 1), True, 0.01),
        )
        self.mean = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions), True, 0.01),
        )
        # renamed from self.logstd: _var() below reads self.logvar
        self.logvar = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions), True, np.log(var)),
        )
        self.max_logvar = np.log(1)     # np.log(2 * var)
        self.min_logvar = np.log(1e-9)
        self.max_var = 1                # 2 * var
        self.min_var = 1e-9
        # note: this re-initialises every Linear layer, overriding init_params above
        self.apply(self._init_weights)

    def forward(self, x):
        return self.critic(x), self.mean(x), self._var(x)

    def _var(self, x):
        logvar = self.logvar(x)
        logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
        var = torch.exp(logvar)
        return var

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def sample_action(mu, var):
    # sample from N(mu, var) as mean + noise * std; randn_like keeps device and dtype
    return mu + torch.randn_like(var) * var.sqrt()


def cont_logprob(mu, var, actions):
    # log-density of a diagonal Gaussian, summed over action dimensions
    p1 = -((mu - actions) ** 2) / (2 * var.clamp(min=1e-3))
    p2 = -torch.log(torch.sqrt(2 * math.pi * var))
    return (p1 + p2).sum(-1, keepdim=True)


def cont_entropy(var):
    # entropy of a diagonal Gaussian: 0.5 * (log(2 * pi * var) + 1) per dimension
    entropy = (torch.log(2 * math.pi * var) + 1) / 2
    return entropy.sum(-1)
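A short usage sketch of the three helpers above. The batch size, action dimension and variance value are arbitrary assumptions for illustration; in practice mu and var come from the model's forward pass.

import torch

# hypothetical shapes: a batch of 8 states, 4-dimensional actions
mu = torch.zeros(8, 4)            # policy mean, as produced by model(obs)
var = torch.full((8, 4), 0.25)    # policy variance, as produced by model(obs)

action = sample_action(mu, var)           # (8, 4) sampled actions
logprob = cont_logprob(mu, var, action)   # (8, 1) log-density summed over dims
entropy = cont_entropy(var)               # (8,)  per-sample entropy

# sanity check against torch.distributions.Normal
ref = torch.distributions.Normal(mu, var.sqrt())
assert torch.allclose(logprob.squeeze(-1), ref.log_prob(action).sum(-1), atol=1e-4)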
...