# Experiment configuration, kept as a plain nested dict of literals.
# Key order matches the original one-line definition. The grouping comments
# below are reading aids inferred from key names only -- confirm the exact
# semantics against whatever code consumes this dict.
exp_config = {
    # Policy identification and execution flags.
    "type": "ppo",
    "on_policy": True,
    "cuda": True,
    "action_space": "discrete",
    # Return / advantage settings.
    "discount_factor": 0.99,
    "gae_lambda": 0.95,
    # Optimisation settings.
    "epoch_per_collect": 10,
    "batch_size": 320,
    "learning_rate": 0.0005,
    # NOTE(review): presumably (step count, decay factor) -- verify with the
    # consumer of this key.
    "lr_scheduler": [1000, 0.1],
    "weight_decay": 0,
    # Loss weighting and clipping.
    "value_weight": 0.5,
    "entropy_weight": 0.01,
    "clip_ratio": 0.2,
    # Normalisation / initialisation switches.
    "adv_norm": True,
    "value_norm": "baseline",
    "ppo_param_init": True,
    "grad_norm": 0.5,
    # Data collection and evaluation.
    "n_sample": 3200,
    "unroll_len": 1,
    "deterministic_eval": True,
    # Network sizes.
    "model": {
        "encoder_hidden_size_list": [64, 64, 128],
        "actor_head_hidden_size": 128,
        "critic_head_hidden_size": 128,
    },
    # Bookkeeping: config type tag, environment id and experiment name.
    "cfg_type": "PPOFPolicyDict",
    "env_id": "QbertNoFrameskip-v4",
    "exp_name": "QbertNoFrameskip-v4-PPO",
}