example:ppg
차이
이 문서에서 선택한 두 판 사이의 차이를 보여줍니다.
다음 판 | 이전 판 | ||
example:ppg [2020/10/19 13:56] – 만듦 rex8312 | example:ppg [2024/03/23 02:42] (현재) – 바깥 편집 127.0.0.1 | ||
---|---|---|---|
줄 78: | 줄 78: | ||
module.weight.data.fill_(1.0) | module.weight.data.fill_(1.0) | ||
- | def forward(self, | + | def forward(self, |
- | | + | |
- | x = self.shared(x) | + | logit = self.policy(self.shared(x)) |
- | logit = self.policy(x) | + | return v, F.log_softmax(logit, |
- | return | + | |
- | | + | def aux_forward(self, |
- | v = self.vf(x) | + | v = self.vf(x) |
- | | + | |
- | return v, F.log_softmax(logit, | + | aux_v = self.aux(x) |
+ | logit = self.policy(x) | ||
+ | return v, aux_v, F.log_softmax(logit, | ||
줄 106: | 줄 108: | ||
model_old = Model(n_features, | model_old = Model(n_features, | ||
model_old.load_state_dict(model.state_dict()) | model_old.load_state_dict(model.state_dict()) | ||
- | | + | |
+ | aux_optimizer | ||
# 테스트 게임 시작 | # 테스트 게임 시작 | ||
줄 212: | 줄 215: | ||
# 미니배치 준비 | # 미니배치 준비 | ||
sel = idx[mb_i * args.mini_batch_size: | sel = idx[mb_i * args.mini_batch_size: | ||
- | obs = torch.from_numpy(FD_obs[sel]).float() | + | obs = torch.tensor(FD_obs[sel], device=args.device).float() |
- | action = torch.from_numpy(FD_action[sel]).long() | + | action = torch.tensor(FD_action[sel], device=args.device).long() |
- | ret = torch.from_numpy(FD_ret[sel]).float() | + | ret = torch.tensor(FD_ret[sel], device=args.device).float() |
- | adv = torch.from_numpy(FD_adv[sel]).float() | + | adv = torch.tensor(FD_adv[sel], device=args.device).float() |
- | logp_old = torch.from_numpy(FD_logp[sel]).float() | + | logp_old = torch.tensor(FD_logp[sel], device=args.device).float() |
# 그래프 생성 | # 그래프 생성 | ||
줄 224: | 줄 227: | ||
# loss_v | # loss_v | ||
- | | + | # loss_v = 0.5 * (ret - value.view(ret.shape)).pow(2).mean() |
- | loss_v = F.smooth_l1_loss(value, | + | |
+ | loss_v = F.mse_loss(value, ret.view(value.shape)) | ||
# loss_pi | # loss_pi | ||
ratios = torch.exp(logp_a - logp_old_a) | ratios = torch.exp(logp_a - logp_old_a) | ||
줄 236: | 줄 240: | ||
loss = loss_v * args.value_coef + loss_pi - args.ent_coef * entropy | loss = loss_v * args.value_coef + loss_pi - args.ent_coef * entropy | ||
- | | + | |
loss.backward() | loss.backward() | ||
torch.nn.utils.clip_grad_norm_(model.parameters(), | torch.nn.utils.clip_grad_norm_(model.parameters(), | ||
- | | + | |
# target 모델 교체 | # target 모델 교체 | ||
줄 249: | 줄 253: | ||
mb_j = mb_i % n_mini_batchs | mb_j = mb_i % n_mini_batchs | ||
sel = idx[mb_j * args.mini_batch_size: | sel = idx[mb_j * args.mini_batch_size: | ||
- | obs = torch.from_numpy(FD_obs[sel]).float() | + | obs = torch.tensor(FD_obs[sel], device=args.device).float() |
- | ret = torch.from_numpy(FD_ret[sel]).float() | + | ret = torch.tensor(FD_ret[sel], device=args.device).float() |
with torch.no_grad(): | with torch.no_grad(): | ||
- | _, logp_old, | + | _, logp_old, |
# 그래프 생성 | # 그래프 생성 | ||
- | value, logp, prob = model(obs, aux=True) | + | value, aux_value, logp, prob = model.aux_forward(obs) |
+ | # loss_v | ||
+ | # loss_v = F.smooth_l1_loss(value, | ||
+ | loss_v = F.mse_loss(value, | ||
# loss_aux_v | # loss_aux_v | ||
- | loss_aux_v = F.smooth_l1_loss(value, | + | |
+ | loss_aux_v = F.mse_loss(aux_value, ret.view(value.shape)) | ||
# loss_kld | # loss_kld | ||
- | kld = (logp_old.exp() | + | kld = (prob_old |
loss = loss_aux_v + args.clone_coef * kld | loss = loss_aux_v + args.clone_coef * kld | ||
- | | + | |
loss.backward() | loss.backward() | ||
torch.nn.utils.clip_grad_norm_(model.parameters(), | torch.nn.utils.clip_grad_norm_(model.parameters(), | ||
- | | + | |
+ | |||
+ | # target 모델 교체 | ||
+ | model_old.load_state_dict(model.state_dict()) | ||
# 학습결과 출력 | # 학습결과 출력 |
example/ppg.1603115786.txt.gz · 마지막으로 수정됨: 2024/03/23 02:38 (바깥 편집)