diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 45c32ac..7465acd 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,7 +3,7 @@ Changelog ========== -Release 1.8.0a13 (WIP) +Release 1.8.0a14 (WIP) -------------------------- Breaking Changes: @@ -13,6 +13,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ +- Added ``stats_window_size`` argument to control smoothing in rollout logging (@jonasreiher) Bug Fixes: ^^^^^^^^^^ @@ -408,4 +409,4 @@ Contributors: ------------- @ku2482 @guyk1971 @minhlong94 @ayeright @kronion @glmcdona @cyprienc @sgillen @Gregwar @rnederstigt @qgallouedec -@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua +@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua @jonasreiher diff --git a/sb3_contrib/ars/ars.py b/sb3_contrib/ars/ars.py index e5ea9ce..eae759c 100644 --- a/sb3_contrib/ars/ars.py +++ b/sb3_contrib/ars/ars.py @@ -41,6 +41,8 @@ class ARS(BaseAlgorithm): :param alive_bonus_offset: Constant added to the reward at each step, used to cancel out alive bonuses. :param n_eval_episodes: Number of episodes to evaluate each candidate. :param policy_kwargs: Keyword arguments to pass to the policy on creation + :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average + the reported success rate, mean episode length, and mean reward over :param tensorboard_log: String with the directory to put tensorboard logs: :param seed: Random seed for the training :param verbose: Verbosity level: 0 no output, 1 info, 2 debug @@ -65,6 +67,7 @@ class ARS(BaseAlgorithm): alive_bonus_offset: float = 0, n_eval_episodes: int = 1, policy_kwargs: Optional[Dict[str, Any]] = None, + stats_window_size: int = 100, tensorboard_log: Optional[str] = None, seed: Optional[int] = None, verbose: int = 0, @@ -75,6 +78,7 @@ class ARS(BaseAlgorithm): policy, env, learning_rate=learning_rate, + stats_window_size=stats_window_size, tensorboard_log=tensorboard_log, policy_kwargs=policy_kwargs, verbose=verbose, diff --git a/sb3_contrib/ppo_mask/ppo_mask.py b/sb3_contrib/ppo_mask/ppo_mask.py index ee88f12..d27c724 100644 --- a/sb3_contrib/ppo_mask/ppo_mask.py +++ b/sb3_contrib/ppo_mask/ppo_mask.py @@ -58,6 +58,8 @@ class MaskablePPO(OnPolicyAlgorithm): because the clipping is not enough to prevent large update see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213) By default, there is no limit on the kl div. + :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average + the reported success rate, mean episode length, and mean reward over :param tensorboard_log: the log location for tensorboard (if None, no logging) :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug @@ -90,6 +92,7 @@ class MaskablePPO(OnPolicyAlgorithm): vf_coef: float = 0.5, max_grad_norm: float = 0.5, target_kl: Optional[float] = None, + stats_window_size: int = 100, tensorboard_log: Optional[str] = None, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 0, @@ -109,6 +112,7 @@ class MaskablePPO(OnPolicyAlgorithm): max_grad_norm=max_grad_norm, use_sde=False, sde_sample_freq=-1, + stats_window_size=stats_window_size, tensorboard_log=tensorboard_log, policy_kwargs=policy_kwargs, verbose=verbose, diff --git a/sb3_contrib/ppo_recurrent/ppo_recurrent.py b/sb3_contrib/ppo_recurrent/ppo_recurrent.py index 78d67d9..344fbda 100644 --- a/sb3_contrib/ppo_recurrent/ppo_recurrent.py +++ b/sb3_contrib/ppo_recurrent/ppo_recurrent.py @@ -56,6 +56,8 @@ class RecurrentPPO(OnPolicyAlgorithm): because the clipping is not enough to prevent large update see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213) By default, there is no limit on the kl div. + :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average + the reported success rate, mean episode length, and mean reward over :param tensorboard_log: the log location for tensorboard (if None, no logging) :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug @@ -90,6 +92,7 @@ class RecurrentPPO(OnPolicyAlgorithm): use_sde: bool = False, sde_sample_freq: int = -1, target_kl: Optional[float] = None, + stats_window_size: int = 100, tensorboard_log: Optional[str] = None, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 0, @@ -109,6 +112,7 @@ class RecurrentPPO(OnPolicyAlgorithm): max_grad_norm=max_grad_norm, use_sde=use_sde, sde_sample_freq=sde_sample_freq, + stats_window_size=stats_window_size, tensorboard_log=tensorboard_log, policy_kwargs=policy_kwargs, verbose=verbose, diff --git a/sb3_contrib/qrdqn/qrdqn.py b/sb3_contrib/qrdqn/qrdqn.py index 7e1785b..62c6a30 100644 --- a/sb3_contrib/qrdqn/qrdqn.py +++ b/sb3_contrib/qrdqn/qrdqn.py @@ -48,6 +48,8 @@ class QRDQN(OffPolicyAlgorithm): :param exploration_initial_eps: initial value of random action probability :param exploration_final_eps: final value of random action probability :param max_grad_norm: The maximum value for the gradient clipping (if None, no clipping) + :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average + the reported success rate, mean episode length, and mean reward over :param tensorboard_log: the log location for tensorboard (if None, no logging) :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug @@ -83,6 +85,7 @@ class QRDQN(OffPolicyAlgorithm): exploration_initial_eps: float = 1.0, exploration_final_eps: float = 0.01, max_grad_norm: Optional[float] = None, + stats_window_size: int = 100, tensorboard_log: Optional[str] = None, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 0, @@ -105,6 +108,7 @@ class QRDQN(OffPolicyAlgorithm): replay_buffer_class=replay_buffer_class, replay_buffer_kwargs=replay_buffer_kwargs, policy_kwargs=policy_kwargs, + stats_window_size=stats_window_size, tensorboard_log=tensorboard_log, verbose=verbose, device=device, diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py index b6ea3cd..5d8889f 100644 --- a/sb3_contrib/tqc/tqc.py +++ b/sb3_contrib/tqc/tqc.py @@ -57,6 +57,9 @@ class TQC(OffPolicyAlgorithm): Default: -1 (only sample at the beginning of the rollout) :param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling during the warm up phase (before learning starts) + :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average + the reported success rate, mean episode length, and mean reward over + :param tensorboard_log: the log location for tensorboard (if None, no logging) :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug :param seed: Seed for the pseudo random generators @@ -94,6 +97,7 @@ class TQC(OffPolicyAlgorithm): use_sde: bool = False, sde_sample_freq: int = -1, use_sde_at_warmup: bool = False, + stats_window_size: int = 100, tensorboard_log: Optional[str] = None, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 0, @@ -116,6 +120,7 @@ class TQC(OffPolicyAlgorithm): replay_buffer_class=replay_buffer_class, replay_buffer_kwargs=replay_buffer_kwargs, policy_kwargs=policy_kwargs, + stats_window_size=stats_window_size, tensorboard_log=tensorboard_log, verbose=verbose, device=device, diff --git a/sb3_contrib/trpo/trpo.py b/sb3_contrib/trpo/trpo.py index 2a7986d..d97cefd 100644 --- a/sb3_contrib/trpo/trpo.py +++ b/sb3_contrib/trpo/trpo.py @@ -58,6 +58,8 @@ class TRPO(OnPolicyAlgorithm): Should be small for stability. Values like 0.01, 0.05. :param sub_sampling_factor: Sub-sample the batch to make computation faster see p40-42 of John Schulman thesis http://joschu.net/docs/thesis.pdf + :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average + the reported success rate, mean episode length, and mean reward over :param tensorboard_log: the log location for tensorboard (if None, no logging) :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug @@ -92,6 +94,7 @@ class TRPO(OnPolicyAlgorithm): normalize_advantage: bool = True, target_kl: float = 0.01, sub_sampling_factor: int = 1, + stats_window_size: int = 100, tensorboard_log: Optional[str] = None, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 0, @@ -111,6 +114,7 @@ class TRPO(OnPolicyAlgorithm): max_grad_norm=0.0, use_sde=use_sde, sde_sample_freq=sde_sample_freq, + stats_window_size=stats_window_size, tensorboard_log=tensorboard_log, policy_kwargs=policy_kwargs, verbose=verbose, diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt index e9033cc..ee006ce 100644 --- a/sb3_contrib/version.txt +++ b/sb3_contrib/version.txt @@ -1 +1 @@ -1.8.0a13 +1.8.0a14 diff --git a/setup.py b/setup.py index b3a17cc..fd78860 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ setup( packages=[package for package in find_packages() if package.startswith("sb3_contrib")], package_data={"sb3_contrib": ["py.typed", "version.txt"]}, install_requires=[ - "stable_baselines3>=1.8.0a13", + "stable_baselines3>=1.8.0a14", ], description="Contrib package of Stable Baselines3, experimental code.", author="Antonin Raffin", diff --git a/tests/test_run.py b/tests/test_run.py index 6753ebb..22c568b 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -148,3 +148,14 @@ def test_advantage_normalization(normalize_advantage): env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60) model = MaskablePPO("MlpPolicy", env, n_steps=64, normalize_advantage=normalize_advantage) model.learn(64) + + +@pytest.mark.parametrize("algo", [TRPO, QRDQN]) +@pytest.mark.parametrize("stats_window_size", [1, 42]) +def test_ep_buffers_stats_window_size(algo, stats_window_size): + """Set stats_window_size for logging to non-default value and check if + ep_info_buffer and ep_success_buffer are initialized to the correct length""" + model = algo("MlpPolicy", "CartPole-v1", stats_window_size=stats_window_size) + model.learn(total_timesteps=10) + assert model.ep_info_buffer.maxlen == stats_window_size + assert model.ep_success_buffer.maxlen == stats_window_size