MO-MPO Example
- mo-mpo-simple.py: a toy comparison of MO-MPO-style per-objective constraints (morl=True) and linear reward scalarization (morl=False) on a 3-action, 2-objective bandit; running it saves the comparison plot to mo_vs_rs.png.
# References
# - Abdolmaleki, Abbas, et al.
#   "A Distributional View on Multi-Objective Policy Optimization."
#   arXiv preprint arXiv:2005.07513 (2020).
# - https://arxiv.org/abs/2005.07513
# - https://github.com/theogruner/rl_pro_telu/blob/master/mpo/mpo.py
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from scipy import optimize


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def train(p1, p2, morl, n_iters, verbose):
    n_actions = 3
    n_objs = 2
    B = 32
    # Payoff matrix: rows are actions, columns are the two objectives.
    payoff = np.array([[1, 20], [3, 3], [4, 1]], dtype=float)
    policy = np.random.random(n_actions)             # policy logits
    critics = np.random.random((n_actions, n_objs))  # one Q estimate per (action, objective)
    eps = np.array([p1, p2])                         # per-objective constraints (or RS weights)
    etas = np.random.random(n_objs)                  # per-objective temperatures

    for i in range(n_iters):
        # Play the game and collect a batch of data
        prob = softmax(policy)
        logp = np.log(prob)
        entropy = -(prob * logp).sum()
        actions = np.random.choice(np.arange(n_actions), B, p=prob)
        rew = payoff[actions]

        # Train the critics toward the observed rewards
        # (v_loss is kept for logging; the update itself uses the TD error,
        #  i.e. the negative gradient of v_loss w.r.t. the Q estimate)
        q_values = critics[actions]
        v_loss = 0.5 * (rew - q_values) ** 2
        critics[actions] += 0.1 * (rew - q_values)

        # Find the per-objective temperature eta_k by minimizing the MO-MPO dual
        # https://github.com/theogruner/rl_pro_telu/blob/master/mpo/mpo.py
        for k in range(n_objs):
            def dual(eta):
                return eta * eps[k] + eta * np.log(np.mean(np.exp(q_values[:, k] / eta), 0))

            bounds = [(1e-6, None)]
            res = optimize.minimize(dual, etas[k], method='SLSQP', bounds=bounds)
            etas[k] = res.x[0]

        # Update the policy
        if morl:
            # MO-MPO-style target: each objective's Q scaled by its own temperature
            target_q = q_values / etas
            # target_q = np.clip(target_q, a_min=-1, a_max=1)
        else:
            # Reward-scalarization baseline: linearly weighted rewards
            target_q = eps * rew
        target_q = (target_q - target_q.mean()) / target_q.std()
        logp_a = logp[actions]
        # Heuristic logit update: since logp_a < 0, the sign of p_loss follows the
        # sign of the standardized advantage summed over objectives.
        p_loss = -(target_q.sum(1) * logp_a) - 0.001 * entropy
        policy[actions] = 0.999 * policy[actions] + 0.01 * p_loss

        if verbose:
            print(i, rew.mean(0), v_loss.mean(), p_loss.mean(), entropy)

    return softmax(policy), rew.mean(0), entropy


if __name__ == '__main__':
    n_exprs = 250
    n_iters = 250

    # MO runs: sweep the per-objective constraints (p1, p2)
    scores = list()
    probs = list()
    entropy = list()
    for i in tqdm.trange(n_exprs):
        if i / n_exprs < 0.5:
            p1 = 0.01
            p2 = 0.01 * (2.0 * i / n_exprs)
        else:
            p1 = 0.01 * (2.0 * (i / n_exprs - 0.5))
            p2 = 0.01
        prob, score, ent = train(p1, p2, morl=True, n_iters=n_iters, verbose=False)
        scores.append(score)
        probs.append(prob)
        entropy.append(ent)
    morl_score = zip(*scores)
    morl_prob = np.vstack(probs)
    morl_entropy = entropy

    # RS runs: sweep random linear weights (w1, w2) with w1 + w2 = 1
    scores = list()
    probs = list()
    entropy = list()
    w1s = np.random.random(n_exprs)
    for w1, w2 in tqdm.tqdm(zip(w1s, 1 - w1s), total=n_exprs):
        prob, score, ent = train(w1, w2, morl=False, n_iters=n_iters, verbose=False)
        scores.append(score)
        probs.append(prob)
        entropy.append(ent)
    rs_score = zip(*scores)
    rs_prob = np.vstack(probs)
    rs_entropy = entropy

    # Compare the achieved reward pairs and the final policy entropies
    fig, axs = plt.subplots(2, 2)
    axs[0, 0].scatter(*morl_score, alpha=0.2)
    axs[0, 0].set_title('MO rewards')
    axs[0, 1].scatter(*rs_score, alpha=0.2)
    axs[0, 1].set_title('RS rewards')
    axs[1, 0].hist(morl_entropy, bins=25)
    axs[1, 0].set_title('MO entropy')
    axs[1, 1].hist(rs_entropy, bins=25)
    axs[1, 1].set_title('RS entropy')
    plt.savefig('mo_vs_rs.png')
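The piece taken most directly from the MO-MPO paper is the per-objective temperature search: each eta_k is found by minimizing the convex dual g(eta) = eta * eps_k + eta * log(mean_a exp(Q_k(a) / eta)), and the improved action distribution then reweights the reference distribution by exp(Q_k(a) / eta_k), so a larger eps_k allows a larger shift toward that objective's best action. The snippet below is a minimal sketch of that step in isolation on the same toy payoff matrix; the helper name solve_temperature, the uniform averaging over the three actions (instead of the sampled batch used above), and the specific eps values are illustrative assumptions, not part of the script.

import numpy as np
from scipy import optimize

# Toy payoff matrix from the script above: 3 actions (rows) x 2 objectives (columns).
payoff = np.array([[1, 20], [3, 3], [4, 1]], dtype=float)


def solve_temperature(q, eps, eta0=1.0):
    """Minimize the MO-MPO dual g(eta) = eta*eps + eta*log(mean_a exp(q_a/eta))."""
    def dual(eta):
        eta = np.atleast_1d(eta)[0]  # SLSQP passes a length-1 array
        q_max = q.max()              # log-sum-exp shift for numerical stability
        return eta * eps + q_max + eta * np.log(np.mean(np.exp((q - q_max) / eta)))

    res = optimize.minimize(dual, np.array([eta0]), method='SLSQP', bounds=[(1e-6, None)])
    return res.x[0]


for k in range(2):
    for eps_k in (0.01, 0.1):
        q_k = payoff[:, k]                      # Q-values of the 3 actions for objective k
        eta_k = solve_temperature(q_k, eps_k)
        w = np.exp((q_k - q_k.max()) / eta_k)   # improved distribution ~ exp(Q_k / eta_k)
        w /= w.sum()
        print(f'objective {k}, eps={eps_k}: eta={eta_k:.2f}, weights={np.round(w, 3)}')

This is also why the main script sweeps (p1, p2) for the MO runs while the scalarized baseline sweeps linear weights: under MO-MPO the eps_k express the preference over objectives as per-objective KL budgets, which is invariant to the scale of the rewards, whereas the RS baseline encodes preference through the magnitudes of the weighted rewards themselves.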