Add stats window argument (#171)

* added missing tensorboard_log docstring

* added stats_window_size argument to all models

* changelog updated

* Update SB3 version

* fixed passing stats_window_size to parent

* added test of stats_window_size

---------

Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de>
This commit is contained in:
Jonas Reiher 2023-04-05 18:47:27 +02:00 committed by GitHub
parent ce115982aa
commit aacded79c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 41 additions and 4 deletions

View File

@@ -3,7 +3,7 @@
Changelog
==========
Release 1.8.0a13 (WIP)
Release 1.8.0a14 (WIP)
--------------------------
Breaking Changes:
@@ -13,6 +13,7 @@ Breaking Changes:
New Features:
^^^^^^^^^^^^^
- Added ``stats_window_size`` argument to control smoothing in rollout logging (@jonasreiher)
Bug Fixes:
^^^^^^^^^^
@@ -408,4 +409,4 @@ Contributors:
-------------
@ku2482 @guyk1971 @minhlong94 @ayeright @kronion @glmcdona @cyprienc @sgillen @Gregwar @rnederstigt @qgallouedec
@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua
@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua @jonasreiher

View File

@@ -41,6 +41,8 @@ class ARS(BaseAlgorithm):
:param alive_bonus_offset: Constant added to the reward at each step, used to cancel out alive bonuses.
:param n_eval_episodes: Number of episodes to evaluate each candidate.
:param policy_kwargs: Keyword arguments to pass to the policy on creation
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: String with the directory to put tensorboard logs:
:param seed: Random seed for the training
:param verbose: Verbosity level: 0 no output, 1 info, 2 debug
@@ -65,6 +67,7 @@ class ARS(BaseAlgorithm):
alive_bonus_offset: float = 0,
n_eval_episodes: int = 1,
policy_kwargs: Optional[Dict[str, Any]] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
seed: Optional[int] = None,
verbose: int = 0,
@@ -75,6 +78,7 @@ class ARS(BaseAlgorithm):
policy,
env,
learning_rate=learning_rate,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
policy_kwargs=policy_kwargs,
verbose=verbose,

View File

@@ -58,6 +58,8 @@ class MaskablePPO(OnPolicyAlgorithm):
because the clipping is not enough to prevent large update
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
By default, there is no limit on the kl div.
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
@@ -90,6 +92,7 @@ class MaskablePPO(OnPolicyAlgorithm):
vf_coef: float = 0.5,
max_grad_norm: float = 0.5,
target_kl: Optional[float] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -109,6 +112,7 @@ class MaskablePPO(OnPolicyAlgorithm):
max_grad_norm=max_grad_norm,
use_sde=False,
sde_sample_freq=-1,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
policy_kwargs=policy_kwargs,
verbose=verbose,

View File

@@ -56,6 +56,8 @@ class RecurrentPPO(OnPolicyAlgorithm):
because the clipping is not enough to prevent large update
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
By default, there is no limit on the kl div.
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
@@ -90,6 +92,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
use_sde: bool = False,
sde_sample_freq: int = -1,
target_kl: Optional[float] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -109,6 +112,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
max_grad_norm=max_grad_norm,
use_sde=use_sde,
sde_sample_freq=sde_sample_freq,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
policy_kwargs=policy_kwargs,
verbose=verbose,

View File

@@ -48,6 +48,8 @@ class QRDQN(OffPolicyAlgorithm):
:param exploration_initial_eps: initial value of random action probability
:param exploration_final_eps: final value of random action probability
:param max_grad_norm: The maximum value for the gradient clipping (if None, no clipping)
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
@@ -83,6 +85,7 @@ class QRDQN(OffPolicyAlgorithm):
exploration_initial_eps: float = 1.0,
exploration_final_eps: float = 0.01,
max_grad_norm: Optional[float] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -105,6 +108,7 @@ class QRDQN(OffPolicyAlgorithm):
replay_buffer_class=replay_buffer_class,
replay_buffer_kwargs=replay_buffer_kwargs,
policy_kwargs=policy_kwargs,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
verbose=verbose,
device=device,

View File

@@ -57,6 +57,9 @@ class TQC(OffPolicyAlgorithm):
Default: -1 (only sample at the beginning of the rollout)
:param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
during the warm up phase (before learning starts)
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
:param seed: Seed for the pseudo random generators
@@ -94,6 +97,7 @@ class TQC(OffPolicyAlgorithm):
use_sde: bool = False,
sde_sample_freq: int = -1,
use_sde_at_warmup: bool = False,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -116,6 +120,7 @@ class TQC(OffPolicyAlgorithm):
replay_buffer_class=replay_buffer_class,
replay_buffer_kwargs=replay_buffer_kwargs,
policy_kwargs=policy_kwargs,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
verbose=verbose,
device=device,

View File

@@ -58,6 +58,8 @@ class TRPO(OnPolicyAlgorithm):
Should be small for stability. Values like 0.01, 0.05.
:param sub_sampling_factor: Sub-sample the batch to make computation faster
see p40-42 of John Schulman thesis http://joschu.net/docs/thesis.pdf
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
@@ -92,6 +94,7 @@ class TRPO(OnPolicyAlgorithm):
normalize_advantage: bool = True,
target_kl: float = 0.01,
sub_sampling_factor: int = 1,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -111,6 +114,7 @@ class TRPO(OnPolicyAlgorithm):
max_grad_norm=0.0,
use_sde=use_sde,
sde_sample_freq=sde_sample_freq,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
policy_kwargs=policy_kwargs,
verbose=verbose,

View File

@@ -1 +1 @@
1.8.0a13
1.8.0a14

View File

@@ -65,7 +65,7 @@ setup(
packages=[package for package in find_packages() if package.startswith("sb3_contrib")],
package_data={"sb3_contrib": ["py.typed", "version.txt"]},
install_requires=[
"stable_baselines3>=1.8.0a13",
"stable_baselines3>=1.8.0a14",
],
description="Contrib package of Stable Baselines3, experimental code.",
author="Antonin Raffin",

View File

@@ -148,3 +148,14 @@ def test_advantage_normalization(normalize_advantage):
env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)
model = MaskablePPO("MlpPolicy", env, n_steps=64, normalize_advantage=normalize_advantage)
model.learn(64)
@pytest.mark.parametrize("algo", [TRPO, QRDQN])
@pytest.mark.parametrize("stats_window_size", [1, 42])
def test_ep_buffers_stats_window_size(algo, stats_window_size):
"""Set stats_window_size for logging to non-default value and check if
ep_info_buffer and ep_success_buffer are initialized to the correct length"""
model = algo("MlpPolicy", "CartPole-v1", stats_window_size=stats_window_size)
model.learn(total_timesteps=10)
assert model.ep_info_buffer.maxlen == stats_window_size
assert model.ep_success_buffer.maxlen == stats_window_size