MO-MPO Example
- mo-mpo-simple.py: a toy comparison of MO-MPO-style per-objective constraints (morl=True) and linear reward scalarization (morl=False) on a 3-action, 2-objective bandit; running it saves the comparison plot to mo_vs_rs.png.
# References
# - Abdolmaleki, Abbas, et al.
#   "A Distributional View on Multi-Objective Policy Optimization."
#   arXiv preprint arXiv:2005.07513 (2020).
# - https://arxiv.org/abs/2005.07513
# - https://github.com/theogruner/rl_pro_telu/blob/master/mpo/mpo.py
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from scipy import optimize


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def train(p1, p2, morl, n_iters, verbose):
    n_actions = 3
    n_objs = 2
    B = 32
    # Payoff matrix: rows are actions, columns are the two objectives.
    payoff = np.array([[1, 20], [3, 3], [4, 1]], dtype=float)
    policy = np.random.random(n_actions)             # policy logits
    critics = np.random.random((n_actions, n_objs))  # one Q estimate per (action, objective)
    eps = np.array([p1, p2])                         # per-objective constraints (or RS weights)
    etas = np.random.random(n_objs)                  # per-objective temperatures

    for i in range(n_iters):
        # Play the game and collect a batch of data
        prob = softmax(policy)
        logp = np.log(prob)
        entropy = -(prob * logp).sum()
        actions = np.random.choice(np.arange(n_actions), B, p=prob)
        rew = payoff[actions]

        # Train the critics toward the observed rewards
        # (v_loss is kept for logging; the update itself uses the TD error,
        #  i.e. the negative gradient of v_loss w.r.t. the Q estimate)
        q_values = critics[actions]
        v_loss = 0.5 * (rew - q_values) ** 2
        critics[actions] += 0.1 * (rew - q_values)

        # Find the per-objective temperature eta_k by minimizing the MO-MPO dual
        # https://github.com/theogruner/rl_pro_telu/blob/master/mpo/mpo.py
        for k in range(n_objs):
            def dual(eta):
                return eta * eps[k] + eta * np.log(np.mean(np.exp(q_values[:, k] / eta), 0))

            bounds = [(1e-6, None)]
            res = optimize.minimize(dual, etas[k], method='SLSQP', bounds=bounds)
            etas[k] = res.x[0]

        # Update the policy
        if morl:
            # MO-MPO-style target: each objective's Q scaled by its own temperature
            target_q = q_values / etas
            # target_q = np.clip(target_q, a_min=-1, a_max=1)
        else:
            # Reward-scalarization baseline: linearly weighted rewards
            target_q = eps * rew
        target_q = (target_q - target_q.mean()) / target_q.std()
        logp_a = logp[actions]
        # Heuristic logit update: since logp_a < 0, the sign of p_loss follows the
        # sign of the standardized advantage summed over objectives.
        p_loss = -(target_q.sum(1) * logp_a) - 0.001 * entropy
        policy[actions] = 0.999 * policy[actions] + 0.01 * p_loss

        if verbose:
            print(i, rew.mean(0), v_loss.mean(), p_loss.mean(), entropy)

    return softmax(policy), rew.mean(0), entropy


if __name__ == '__main__':
    n_exprs = 250
    n_iters = 250

    # MO runs: sweep the per-objective constraints (p1, p2)
    scores = list()
    probs = list()
    entropy = list()
    for i in tqdm.trange(n_exprs):
        if i / n_exprs < 0.5:
            p1 = 0.01
            p2 = 0.01 * (2.0 * i / n_exprs)
        else:
            p1 = 0.01 * (2.0 * (i / n_exprs - 0.5))
            p2 = 0.01
        prob, score, ent = train(p1, p2, morl=True, n_iters=n_iters, verbose=False)
        scores.append(score)
        probs.append(prob)
        entropy.append(ent)
    morl_score = zip(*scores)
    morl_prob = np.vstack(probs)
    morl_entropy = entropy

    # RS runs: sweep random linear weights (w1, w2) with w1 + w2 = 1
    scores = list()
    probs = list()
    entropy = list()
    w1s = np.random.random(n_exprs)
    for w1, w2 in tqdm.tqdm(zip(w1s, 1 - w1s), total=n_exprs):
        prob, score, ent = train(w1, w2, morl=False, n_iters=n_iters, verbose=False)
        scores.append(score)
        probs.append(prob)
        entropy.append(ent)
    rs_score = zip(*scores)
    rs_prob = np.vstack(probs)
    rs_entropy = entropy

    # Compare the achieved reward pairs and the final policy entropies
    fig, axs = plt.subplots(2, 2)
    axs[0, 0].scatter(*morl_score, alpha=0.2)
    axs[0, 0].set_title('MO rewards')
    axs[0, 1].scatter(*rs_score, alpha=0.2)
    axs[0, 1].set_title('RS rewards')
    axs[1, 0].hist(morl_entropy, bins=25)
    axs[1, 0].set_title('MO entropy')
    axs[1, 1].hist(rs_entropy, bins=25)
    axs[1, 1].set_title('RS entropy')
    plt.savefig('mo_vs_rs.png')
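The piece taken most directly from the MO-MPO paper is the per-objective temperature search: each eta_k is found by minimizing the convex dual g(eta) = eta * eps_k + eta * log(mean_a exp(Q_k(a) / eta)), and the improved action distribution then reweights the reference distribution by exp(Q_k(a) / eta_k), so a larger eps_k allows a larger shift toward that objective's best action. The snippet below is a minimal sketch of that step in isolation on the same toy payoff matrix; the helper name solve_temperature, the uniform averaging over the three actions (instead of the sampled batch used above), and the specific eps values are illustrative assumptions, not part of the script.

import numpy as np
from scipy import optimize

# Toy payoff matrix from the script above: 3 actions (rows) x 2 objectives (columns).
payoff = np.array([[1, 20], [3, 3], [4, 1]], dtype=float)


def solve_temperature(q, eps, eta0=1.0):
    """Minimize the MO-MPO dual g(eta) = eta*eps + eta*log(mean_a exp(q_a/eta))."""
    def dual(eta):
        eta = np.atleast_1d(eta)[0]  # SLSQP passes a length-1 array
        q_max = q.max()              # log-sum-exp shift for numerical stability
        return eta * eps + q_max + eta * np.log(np.mean(np.exp((q - q_max) / eta)))

    res = optimize.minimize(dual, np.array([eta0]), method='SLSQP', bounds=[(1e-6, None)])
    return res.x[0]


for k in range(2):
    for eps_k in (0.01, 0.1):
        q_k = payoff[:, k]                      # Q-values of the 3 actions for objective k
        eta_k = solve_temperature(q_k, eps_k)
        w = np.exp((q_k - q_k.max()) / eta_k)   # improved distribution ~ exp(Q_k / eta_k)
        w /= w.sum()
        print(f'objective {k}, eps={eps_k}: eta={eta_k:.2f}, weights={np.round(w, 3)}')

This is also why the main script sweeps (p1, p2) for the MO runs while the scalarized baseline sweeps linear weights: under MO-MPO the eps_k express the preference over objectives as per-objective KL budgets, which is invariant to the scale of the rewards, whereas the RS baseline encodes preference through the magnitudes of the weighted rewards themselves.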