continuous_control
<code python>
import numpy as np
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, n_features, n_actions):  # signature truncated in the source; n_actions assumed
        super().__init__()

        # per-dimension action ranges (36 entries -> 36-dimensional action space)
        self.action_scale = torch.FloatTensor([[
            0.20833333, 1.        , 1.        , 1.        , 0.25      ,
            1.        , 1.        , 1.        , 0.12077295, 1.        ,
            1.        , 1.        , 0.15923567, 0.15923567, 1.        ,
            1.        , 1.        , 0.07961783, 1.        , 1.        ,
            1.        , 0.15923567, 0.12077295, 1.        , 1.        ,
            1.        , 0.15923567, 0.15923567, 1.        , 1.        ,
            1.        , 0.10775862, 1.        , 1.        , 1.        ,
            0.15923567
        ]])

        vc = 4  # width multiplier for the critic

        self.critic = nn.Sequential(
            init_params(nn.Linear(n_features, vc * 1024)),
            nn.LayerNorm(vc * 1024),
            nn.ReLU(),
            init_params(nn.Linear(vc * 1024, vc * 512)),
            nn.LayerNorm(vc * 512),
            nn.ReLU(),
            init_params(nn.Linear(vc * 512, 1), True, 0.01),
        )

        self.mean = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions)),  # output width truncated in the source; n_actions assumed
        )

        self.logstd = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions)),
        )
        self.max_logstd = np.log(1)  # np.log(2 * std)
        self.min_logstd = np.log(1e-9)
        self.max_std = 1  # 2 * std
        self.min_std = 1e-9

    def forward(self, x):
        return self.critic(x), self.mean(x), self.std(x)

    def std(self, x):
        logstd = self.logstd(x)
        logstd = torch.clamp(logstd, self.min_logstd, self.max_logstd)
        std = torch.exp(logstd)  # * self.action_scale.to(x.device)
        return std


# rollout excerpt: sample an action from the diagonal Gaussian policy
value, mu, sigma = self.model_old(
    torch.from_numpy(self.states[-1]).to(self.device_old)
)
cov_mat = torch.diag_embed(sigma ** 2).to(self.device_old)
dist = torch.distributions.MultivariateNormal(mu, cov_mat)
action = dist.sample()
# per-env mask mixing stochastic samples with the greedy mean action;
# the binomial probability is truncated in the source (sample_prob is a placeholder)
mask = np.random.binomial(1, sample_prob, size=(action.shape[0], 1))
action = action.cpu().numpy() * mask + mu.cpu().numpy() * (1 - mask)
action = torch.from_numpy(action).to(mu.device).to(torch.float32)
action_lp = dist.log_prob(action).view(-1, 1)
state, reward, done, info = self.env.step(action.cpu().numpy())
for _done, _info in zip(done, info):
    if _done:
        self.score_buffer.append(_info['score'])  # dict key truncated in the source; 'score' assumed
state = self.env.reset(state, done)  # arguments truncated in the source; resets only finished envs
reward = self.reward_scale * reward
mask = (1 - done).astype(np.float32)
</code>
| + | |||
<code python>
import numpy as np
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, n_features, n_actions):  # signature truncated in the source; n_actions assumed
        super().__init__()

        self.action_scale = torch.FloatTensor([[
            0.20833333, 1.        , 1.        , 1.        , 0.25      ,
            1.        , 1.        , 1.        , 0.12077295, 1.        ,
            1.        , 1.        , 0.15923567, 0.15923567, 1.        ,
            1.        , 1.        , 0.07961783, 1.        , 1.        ,
            1.        , 0.15923567, 0.12077295, 1.        , 1.        ,
            1.        , 0.15923567, 0.15923567, 1.        , 1.        ,
            1.        , 0.10775862, 1.        , 1.        , 1.        ,
            0.15923567
        ]])

        vc = 4  # width multiplier for the critic

        self.critic = nn.Sequential(
            init_params(nn.Linear(n_features, vc * 1024)),
            nn.LayerNorm(vc * 1024),
            nn.ReLU(),
            init_params(nn.Linear(vc * 1024, vc * 512)),
            nn.LayerNorm(vc * 512),
            nn.ReLU(),
            init_params(nn.Linear(vc * 512, 1), True, 0.01),
        )

        self.mean = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions)),  # output width truncated in the source; n_actions assumed
        )

        # log-variance head (named logvar so that _var() below can read it)
        self.logvar = nn.Sequential(
            init_params(nn.Linear(n_features, 1024)),
            nn.LayerNorm(1024),
            nn.ReLU(),
            init_params(nn.Linear(1024, 512)),
            nn.LayerNorm(512),
            nn.ReLU(),
            init_params(nn.Linear(512, n_actions)),
        )
        self.max_logvar = np.log(1)  # np.log(2 * var)
        self.min_logvar = np.log(1e-9)
        self.max_var = 1  # 2 * var
        self.min_var = 1e-9

        self.apply(self._init_weights)

    def forward(self, x):
        return self.critic(x), self.mean(x), self._var(x)

    def _var(self, x):
        logvar = self.logvar(x)
        logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
        var = torch.exp(logvar)
        return var

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)  # std truncated in the source; 0.02 assumed
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def sample_action(mu, var):
    # reparameterized sample from N(mu, var); randn_like keeps device and dtype consistent
    return mu + torch.randn_like(var) * var.sqrt()
</code>
<code python>
# fragment preserved from this revision; the rest of the block is not shown
def cont_logprob(mu,
import torch
</code>
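The body of `cont_logprob` is cut off in this revision. For a diagonal Gaussian policy it would typically evaluate the log-density of `action` under N(mu, sigma²); a minimal sketch under that assumption (the argument names after `mu` are guesses):

<code python>
import math
import torch

def cont_logprob(mu, sigma, action):
    # log N(action | mu, sigma^2) per dimension, summed over the action axis
    var = sigma ** 2
    logprob = -(action - mu) ** 2 / (2 * var) - torch.log(sigma) - 0.5 * math.log(2 * math.pi)
    return logprob.sum(dim=-1, keepdim=True)
</code>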
- TD3: https://
| + | |||
| + | |||
| + | {{tag>RL continuous_control action_space}} | ||