Demo of PPO Configuration

This demo rewrites a typical PPO policy configuration, originally kept as a plain Python dict, with argsloader, so that every field is validated and normalized when the configuration is loaded.
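Before reading the full configuration, it may help to see how the argsloader units compose. Below is a minimal sketch whose semantics are inferred from this demo itself (not a full API reference): cvalue(default, unit) attaches a default value and a validator, | tries alternative units, and >> pipes a parsed value into the next check.

from argsloader.units import cdict, cvalue, number, onoff, positive, yesno

# Minimal sketch of the unit DSL used in this demo (semantics inferred
# from the demo itself, not a full reference):
# - cvalue(default, unit): use `default` when the key is absent, validate otherwise;
# - yesno() | onoff():     accept a yes/no-style OR an on/off-style value as a bool;
# - number() >> positive.int(): parse a number, then require a positive integer.
mini_loader = cdict(dict(
    cuda=cvalue(False, yesno() | onoff()),
    batch_size=cvalue(64, number() >> positive.int()),
))

# Expected, based on the full demo's output below: 'on' is parsed to True,
# and batch_size falls back to its default of 64.
print(mini_loader.call({'cuda': 'on'}))

With these pieces in mind, here is the full version based on argsloader.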

from enum import Enum, unique
from pprint import pprint

from argsloader.units import cdict, enum, cvalue, yesno, number, onoff, positive, interval, optional


@unique
class PolicyType(Enum):
    PPO = 1


@unique
class ActionSpaceType(Enum):
    DISCRETE = 1
    CONTINUOUS = 2
    HYBRID = 3


@unique
class GradClipType(Enum):
    NONE = 1
    CLIP_MOMENTUM = 2
    CLIP_VALUE = 3
    CLIP_NORM = 4
    CLIP_MOMENTUM_NORM = 5


config_loader = cdict(dict(
    # (str) RL policy register name (refer to function "POLICY_REGISTRY").
    type=cvalue('ppo', enum(PolicyType)),
    # (bool) Whether to use cuda for network.
    cuda=cvalue(False, yesno() | onoff()),
    # (bool) Whether the RL algorithm is on-policy or off-policy. (Note: in practice, PPO can also be used off-policy.)
    on_policy=cvalue(True, yesno()),
    # (bool) Whether to use priority (prioritized sampling, IS weight, priority update).
    priority=cvalue(False, yesno()),
    # (bool) Whether to use Importance Sampling Weight to correct biased update due to priority.
    # If True, priority must be True.
    priority_IS_weight=cvalue(False, yesno()),
    # (bool) Whether to recompute advantages in each iteration of on-policy PPO.
    recompute_adv=cvalue(True, yesno()),
    # (str) Which kind of action space is used in PPOPolicy: ['discrete', 'continuous', 'hybrid'].
    action_space=cvalue('discrete', enum(ActionSpaceType)),
    # (bool) Whether to use the n-step return to calculate the value target; otherwise, return = adv + value is used.
    nstep_return=cvalue(False, yesno()),
    # (bool) Whether to enable multi-agent training, i.e., MAPPO.
    multi_agent=cvalue(False, yesno()),
    # (bool) Whether policy data is needed when processing transitions.
    transition_with_policy_data=cvalue(True, yesno()),
    learn=dict(
        # (bool) Whether to use multiple GPUs for training.
        multi_gpu=cvalue(False, yesno()),
        # (int) How many epochs to train for on each collected batch of data.
        epoch_per_collect=cvalue(10, number() >> positive.int()),
        # (int) Batch size for each gradient step.
        batch_size=cvalue(64, number() >> positive.int()),
        # (float) Learning rate of the optimizer.
        learning_rate=cvalue(3e-4, number()),
        # ==============================================================
        # The following configs are algorithm-specific
        # ==============================================================
        # (float) The loss weight of the value network; the policy network weight is set to 1.
        value_weight=cvalue(0.5, number()),
        # (float) The loss weight of entropy regularization; the policy network weight is set to 1.
        entropy_weight=cvalue(0.0, number()),
        # (float) PPO clip ratio, defaults to 0.2
        clip_ratio=cvalue(0.2, number() >> interval.LR(0, 1)),
        # (bool) Whether to normalize advantages over the whole training batch.
        adv_norm=cvalue(True, yesno()),
        # (bool) Whether to normalize the value target.
        value_norm=cvalue(True, yesno()),
        # (bool) Whether to use PPO-specific parameter initialization.
        ppo_param_init=cvalue(True, yesno()),
        # (str or None) Gradient clip method; see GradClipType for the options.
        grad_clip_type=cvalue('clip_norm', optional(enum(GradClipType))),
        # (float) Threshold used by the gradient clip method.
        grad_clip_value=cvalue(0.5, number()),
        # (bool) Whether to ignore the done flag (usually for environments terminated by a step limit).
        ignore_done=cvalue(False, yesno()),
    ),
    collect=dict(
        # (int) Only one of [n_sample, n_episode] should be set.
        n_sample=cvalue(64, number() >> positive.int()),
        # (int) Cut trajectories into pieces with length "unroll_len".
        unroll_len=cvalue(1, number() >> positive.int()),
        # ==============================================================
        # The following configs are algorithm-specific
        # ==============================================================
        # (float) Reward's future discount factor, a.k.a. gamma.
        discount_factor=cvalue(0.99, number() >> interval.LR(0, 1)),
        # (float) GAE lambda factor balancing bias and variance (between 1-step TD and MC returns).
        gae_lambda=cvalue(0.95, number() >> interval.LR(0, 1)),
    ),
    eval=dict(),
))

if __name__ == '__main__':
    pprint(config_loader.call({
        'learn': {
            'grad_clip_type': 'clip_value',
        },
        'collect': {
            'n_sample': 320,
            'unroll_len': 1,
            'discount_factor': 0.95,
        },
        'cuda': 'on',
    }), indent=4)

The result should be:

{   'action_space': <ActionSpaceType.DISCRETE: 1>,
    'collect': {   'discount_factor': 0.95,
                   'gae_lambda': 0.95,
                   'n_sample': 320,
                   'unroll_len': 1},
    'cuda': True,
    'eval': {},
    'learn': {   'adv_norm': True,
                 'batch_size': 64,
                 'clip_ratio': 0.2,
                 'entropy_weight': 0.0,
                 'epoch_per_collect': 10,
                 'grad_clip_type': <GradClipType.CLIP_VALUE: 3>,
                 'grad_clip_value': 0.5,
                 'ignore_done': False,
                 'learning_rate': 0.0003,
                 'multi_gpu': False,
                 'ppo_param_init': True,
                 'value_norm': True,
                 'value_weight': 0.5},
    'multi_agent': False,
    'nstep_return': False,
    'on_policy': True,
    'priority': False,
    'priority_IS_weight': False,
    'recompute_adv': True,
    'transition_with_policy_data': True,
    'type': <PolicyType.PPO: 1>}
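
A practical benefit of this loader is that out-of-range or malformed values fail at load time instead of deep inside training code. Here is a minimal sketch of a failure case, reusing the config_loader defined above; the exact exception class argsloader raises is not shown in this demo, so the snippet catches broadly.

# Hypothetical failure case (reusing `config_loader` from above):
# clip_ratio is validated by `number() >> interval.LR(0, 1)`, so 1.5
# should be rejected. The concrete exception type is an assumption,
# hence the broad except clause.
try:
    config_loader.call({'learn': {'clip_ratio': 1.5}})
except Exception as err:
    print('rejected:', type(err).__name__, err)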