Add stats window argument (#171)
* added missing tensorboard_log docstring
* added stats_window_size argument to all models
* changelog updated
* Update SB3 version
* fixed passing stats_window_size to parent
* added test of stats_window_size

---------

Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de>
This commit is contained in:
parent
ce115982aa
commit
aacded79c5
|
|
@ -3,7 +3,7 @@
|
|||
Changelog
|
||||
==========
|
||||
|
||||
Release 1.8.0a13 (WIP)
|
||||
Release 1.8.0a14 (WIP)
|
||||
--------------------------
|
||||
|
||||
Breaking Changes:
|
||||
|
|
@ -13,6 +13,7 @@ Breaking Changes:
|
|||
|
||||
New Features:
|
||||
^^^^^^^^^^^^^
|
||||
- Added ``stats_window_size`` argument to control smoothing in rollout logging (@jonasreiher)
|
||||
|
||||
Bug Fixes:
|
||||
^^^^^^^^^^
|
||||
|
|
@ -408,4 +409,4 @@ Contributors:
|
|||
-------------
|
||||
|
||||
@ku2482 @guyk1971 @minhlong94 @ayeright @kronion @glmcdona @cyprienc @sgillen @Gregwar @rnederstigt @qgallouedec
|
||||
@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua
|
||||
@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua @jonasreiher
|
||||
|
|
|
|||
|
|
@ -41,6 +41,8 @@ class ARS(BaseAlgorithm):
|
|||
:param alive_bonus_offset: Constant added to the reward at each step, used to cancel out alive bonuses.
|
||||
:param n_eval_episodes: Number of episodes to evaluate each candidate.
|
||||
:param policy_kwargs: Keyword arguments to pass to the policy on creation
|
||||
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
|
||||
the reported success rate, mean episode length, and mean reward over
|
||||
:param tensorboard_log: String with the directory to put tensorboard logs
|
||||
:param seed: Random seed for the training
|
||||
:param verbose: Verbosity level: 0 no output, 1 info, 2 debug
|
||||
|
|
@ -65,6 +67,7 @@ class ARS(BaseAlgorithm):
|
|||
alive_bonus_offset: float = 0,
|
||||
n_eval_episodes: int = 1,
|
||||
policy_kwargs: Optional[Dict[str, Any]] = None,
|
||||
stats_window_size: int = 100,
|
||||
tensorboard_log: Optional[str] = None,
|
||||
seed: Optional[int] = None,
|
||||
verbose: int = 0,
|
||||
|
|
@ -75,6 +78,7 @@ class ARS(BaseAlgorithm):
|
|||
policy,
|
||||
env,
|
||||
learning_rate=learning_rate,
|
||||
stats_window_size=stats_window_size,
|
||||
tensorboard_log=tensorboard_log,
|
||||
policy_kwargs=policy_kwargs,
|
||||
verbose=verbose,
|
||||
|
|
|
|||
|
|
@ -58,6 +58,8 @@ class MaskablePPO(OnPolicyAlgorithm):
|
|||
because the clipping is not enough to prevent large update
|
||||
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
|
||||
By default, there is no limit on the kl div.
|
||||
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
|
||||
the reported success rate, mean episode length, and mean reward over
|
||||
:param tensorboard_log: the log location for tensorboard (if None, no logging)
|
||||
:param policy_kwargs: additional arguments to be passed to the policy on creation
|
||||
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
|
||||
|
|
@ -90,6 +92,7 @@ class MaskablePPO(OnPolicyAlgorithm):
|
|||
vf_coef: float = 0.5,
|
||||
max_grad_norm: float = 0.5,
|
||||
target_kl: Optional[float] = None,
|
||||
stats_window_size: int = 100,
|
||||
tensorboard_log: Optional[str] = None,
|
||||
policy_kwargs: Optional[Dict[str, Any]] = None,
|
||||
verbose: int = 0,
|
||||
|
|
@ -109,6 +112,7 @@ class MaskablePPO(OnPolicyAlgorithm):
|
|||
max_grad_norm=max_grad_norm,
|
||||
use_sde=False,
|
||||
sde_sample_freq=-1,
|
||||
stats_window_size=stats_window_size,
|
||||
tensorboard_log=tensorboard_log,
|
||||
policy_kwargs=policy_kwargs,
|
||||
verbose=verbose,
|
||||
|
|
|
|||
|
|
@ -56,6 +56,8 @@ class RecurrentPPO(OnPolicyAlgorithm):
|
|||
because the clipping is not enough to prevent large update
|
||||
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
|
||||
By default, there is no limit on the kl div.
|
||||
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
|
||||
the reported success rate, mean episode length, and mean reward over
|
||||
:param tensorboard_log: the log location for tensorboard (if None, no logging)
|
||||
:param policy_kwargs: additional arguments to be passed to the policy on creation
|
||||
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
|
||||
|
|
@ -90,6 +92,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
|
|||
use_sde: bool = False,
|
||||
sde_sample_freq: int = -1,
|
||||
target_kl: Optional[float] = None,
|
||||
stats_window_size: int = 100,
|
||||
tensorboard_log: Optional[str] = None,
|
||||
policy_kwargs: Optional[Dict[str, Any]] = None,
|
||||
verbose: int = 0,
|
||||
|
|
@ -109,6 +112,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
|
|||
max_grad_norm=max_grad_norm,
|
||||
use_sde=use_sde,
|
||||
sde_sample_freq=sde_sample_freq,
|
||||
stats_window_size=stats_window_size,
|
||||
tensorboard_log=tensorboard_log,
|
||||
policy_kwargs=policy_kwargs,
|
||||
verbose=verbose,
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ class QRDQN(OffPolicyAlgorithm):
|
|||
:param exploration_initial_eps: initial value of random action probability
|
||||
:param exploration_final_eps: final value of random action probability
|
||||
:param max_grad_norm: The maximum value for the gradient clipping (if None, no clipping)
|
||||
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
|
||||
the reported success rate, mean episode length, and mean reward over
|
||||
:param tensorboard_log: the log location for tensorboard (if None, no logging)
|
||||
:param policy_kwargs: additional arguments to be passed to the policy on creation
|
||||
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
|
||||
|
|
@ -83,6 +85,7 @@ class QRDQN(OffPolicyAlgorithm):
|
|||
exploration_initial_eps: float = 1.0,
|
||||
exploration_final_eps: float = 0.01,
|
||||
max_grad_norm: Optional[float] = None,
|
||||
stats_window_size: int = 100,
|
||||
tensorboard_log: Optional[str] = None,
|
||||
policy_kwargs: Optional[Dict[str, Any]] = None,
|
||||
verbose: int = 0,
|
||||
|
|
@ -105,6 +108,7 @@ class QRDQN(OffPolicyAlgorithm):
|
|||
replay_buffer_class=replay_buffer_class,
|
||||
replay_buffer_kwargs=replay_buffer_kwargs,
|
||||
policy_kwargs=policy_kwargs,
|
||||
stats_window_size=stats_window_size,
|
||||
tensorboard_log=tensorboard_log,
|
||||
verbose=verbose,
|
||||
device=device,
|
||||
|
|
|
|||
|
|
@ -57,6 +57,9 @@ class TQC(OffPolicyAlgorithm):
|
|||
Default: -1 (only sample at the beginning of the rollout)
|
||||
:param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
|
||||
during the warm up phase (before learning starts)
|
||||
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
|
||||
the reported success rate, mean episode length, and mean reward over
|
||||
:param tensorboard_log: the log location for tensorboard (if None, no logging)
|
||||
:param policy_kwargs: additional arguments to be passed to the policy on creation
|
||||
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
|
||||
:param seed: Seed for the pseudo random generators
|
||||
|
|
@ -94,6 +97,7 @@ class TQC(OffPolicyAlgorithm):
|
|||
use_sde: bool = False,
|
||||
sde_sample_freq: int = -1,
|
||||
use_sde_at_warmup: bool = False,
|
||||
stats_window_size: int = 100,
|
||||
tensorboard_log: Optional[str] = None,
|
||||
policy_kwargs: Optional[Dict[str, Any]] = None,
|
||||
verbose: int = 0,
|
||||
|
|
@ -116,6 +120,7 @@ class TQC(OffPolicyAlgorithm):
|
|||
replay_buffer_class=replay_buffer_class,
|
||||
replay_buffer_kwargs=replay_buffer_kwargs,
|
||||
policy_kwargs=policy_kwargs,
|
||||
stats_window_size=stats_window_size,
|
||||
tensorboard_log=tensorboard_log,
|
||||
verbose=verbose,
|
||||
device=device,
|
||||
|
|
|
|||
|
|
@ -58,6 +58,8 @@ class TRPO(OnPolicyAlgorithm):
|
|||
Should be small for stability. Values like 0.01, 0.05.
|
||||
:param sub_sampling_factor: Sub-sample the batch to make computation faster
|
||||
see p40-42 of John Schulman thesis http://joschu.net/docs/thesis.pdf
|
||||
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
|
||||
the reported success rate, mean episode length, and mean reward over
|
||||
:param tensorboard_log: the log location for tensorboard (if None, no logging)
|
||||
:param policy_kwargs: additional arguments to be passed to the policy on creation
|
||||
:param verbose: the verbosity level: 0 no output, 1 info, 2 debug
|
||||
|
|
@ -92,6 +94,7 @@ class TRPO(OnPolicyAlgorithm):
|
|||
normalize_advantage: bool = True,
|
||||
target_kl: float = 0.01,
|
||||
sub_sampling_factor: int = 1,
|
||||
stats_window_size: int = 100,
|
||||
tensorboard_log: Optional[str] = None,
|
||||
policy_kwargs: Optional[Dict[str, Any]] = None,
|
||||
verbose: int = 0,
|
||||
|
|
@ -111,6 +114,7 @@ class TRPO(OnPolicyAlgorithm):
|
|||
max_grad_norm=0.0,
|
||||
use_sde=use_sde,
|
||||
sde_sample_freq=sde_sample_freq,
|
||||
stats_window_size=stats_window_size,
|
||||
tensorboard_log=tensorboard_log,
|
||||
policy_kwargs=policy_kwargs,
|
||||
verbose=verbose,
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
1.8.0a13
|
||||
1.8.0a14
|
||||
|
|
|
|||
2
setup.py
2
setup.py
|
|
@ -65,7 +65,7 @@ setup(
|
|||
packages=[package for package in find_packages() if package.startswith("sb3_contrib")],
|
||||
package_data={"sb3_contrib": ["py.typed", "version.txt"]},
|
||||
install_requires=[
|
||||
"stable_baselines3>=1.8.0a13",
|
||||
"stable_baselines3>=1.8.0a14",
|
||||
],
|
||||
description="Contrib package of Stable Baselines3, experimental code.",
|
||||
author="Antonin Raffin",
|
||||
|
|
|
|||
|
|
@ -148,3 +148,14 @@ def test_advantage_normalization(normalize_advantage):
|
|||
env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)
|
||||
model = MaskablePPO("MlpPolicy", env, n_steps=64, normalize_advantage=normalize_advantage)
|
||||
model.learn(64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algo", [TRPO, QRDQN])
|
||||
@pytest.mark.parametrize("stats_window_size", [1, 42])
|
||||
def test_ep_buffers_stats_window_size(algo, stats_window_size):
|
||||
"""Set stats_window_size for logging to non-default value and check if
|
||||
ep_info_buffer and ep_success_buffer are initialized to the correct length"""
|
||||
model = algo("MlpPolicy", "CartPole-v1", stats_window_size=stats_window_size)
|
||||
model.learn(total_timesteps=10)
|
||||
assert model.ep_info_buffer.maxlen == stats_window_size
|
||||
assert model.ep_success_buffer.maxlen == stats_window_size
|
||||
|
|
|
|||
Loading…
Reference in New Issue