diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index caf2bc0..24c1295 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,6 +3,29 @@ Changelog ========== + +Release 1.6.2 (2022-10-10) +-------------------------- + +**Progress bar and upgrade to latest SB3 version** + +Breaking Changes: +^^^^^^^^^^^^^^^^^ +- Upgraded to Stable-Baselines3 >= 1.6.2 + +New Features: +^^^^^^^^^^^^^ + +Bug Fixes: +^^^^^^^^^^ + +Deprecations: +^^^^^^^^^^^^^ +- Deprecate parameters ``eval_env``, ``eval_freq`` and ``create_eval_env`` + +Others: +^^^^^^^ + Release 1.6.1 (2022-09-29) ------------------------------- @@ -13,7 +36,6 @@ Breaking Changes: - Fixed the issue that ``predict`` does not always return action as ``np.ndarray`` (@qgallouedec) - Upgraded to Stable-Baselines3 >= 1.6.1 - New Features: ^^^^^^^^^^^^^ @@ -25,7 +47,6 @@ Bug Fixes: - Fixed missing verbose parameter passing in the ``MaskableEvalCallback`` constructor (@burakdmb) - Fixed the issue that when updating the target network in QRDQN, TQC, the ``running_mean`` and ``running_var`` properties of batch norm layers are not updated (@honglu2875) - Deprecations: ^^^^^^^^^^^^^ @@ -35,7 +56,7 @@ Others: Release 1.6.0 (2022-07-11) -------------------------------- +-------------------------- **Add RecurrentPPO (aka PPO LSTM)** diff --git a/sb3_contrib/ars/ars.py b/sb3_contrib/ars/ars.py index e5d7847..a5bc1a1 100644 --- a/sb3_contrib/ars/ars.py +++ b/sb3_contrib/ars/ars.py @@ -321,8 +321,12 @@ class ARS(BaseAlgorithm): :param callback: callback(s) called at every step with state of the algorithm. :param log_interval: The number of timesteps before logging. :param tb_log_name: the name of the run for TensorBoard logging - :param eval_env: Environment that will be used to evaluate the agent - :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little) + :param eval_env: Environment to use for evaluation. 
+ Caution, this parameter is deprecated and will be removed in the future. + Please use `EvalCallback` or a custom Callback instead. + :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little). + Caution, this parameter is deprecated and will be removed in the future. + Please use `EvalCallback` or a custom Callback instead. :param n_eval_episodes: Number of episode to evaluate the agent :param eval_log_path: Path to a folder where the evaluations will be saved :param reset_num_timesteps: whether or not to reset the current timestep number (used in logging) diff --git a/sb3_contrib/ppo_mask/ppo_mask.py b/sb3_contrib/ppo_mask/ppo_mask.py index 451a02a..86e8d21 100644 --- a/sb3_contrib/ppo_mask/ppo_mask.py +++ b/sb3_contrib/ppo_mask/ppo_mask.py @@ -1,5 +1,6 @@ import sys import time +import warnings from collections import deque from typing import Any, Dict, Optional, Tuple, Type, Union @@ -59,7 +60,8 @@ class MaskablePPO(OnPolicyAlgorithm): By default, there is no limit on the kl div. :param tensorboard_log: the log location for tensorboard (if None, no logging) :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. (Only available when passing string for the environment) + used for evaluating the agent periodically (Only available when passing string for the environment). + Caution, this parameter is deprecated and will be removed in the future. :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug :param seed: Seed for the pseudo random generators @@ -183,9 +185,13 @@ class MaskablePPO(OnPolicyAlgorithm): ) -> BaseCallback: """ :param callback: Callback(s) called at every step with state of the algorithm. - :param eval_freq: How many steps between evaluations; if None, do not evaluate. + :param eval_env: Environment to use for evaluation. 
+ Caution, this parameter is deprecated and will be removed in the future. + Please use `MaskableEvalCallback` or a custom Callback instead. + :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little). + Caution, this parameter is deprecated and will be removed in the future. + Please use `MaskableEvalCallback` or a custom Callback instead. :param n_eval_episodes: How many episodes to play per evaluation - :param n_eval_episodes: Number of episodes to rollout during evaluation. :param log_path: Path to a folder where the evaluations will be saved :param use_masking: Whether or not to use invalid action masks during evaluation :return: A hybrid callback calling `callback` and performing evaluation. @@ -234,8 +240,12 @@ class MaskablePPO(OnPolicyAlgorithm): :param total_timesteps: The total number of samples (env steps) to train on :param eval_env: Environment to use for evaluation. + Caution, this parameter is deprecated and will be removed in the future. + Please use `MaskableEvalCallback` or a custom Callback instead. :param callback: Callback(s) called at every step with state of the algorithm. - :param eval_freq: How many steps between evaluations + :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little). + Caution, this parameter is deprecated and will be removed in the future. + Please use `MaskableEvalCallback` or a custom Callback instead. :param n_eval_episodes: How many episodes to play per evaluation :param log_path: Path to a folder where the evaluations will be saved :param reset_num_timesteps: Whether to reset or not the ``num_timesteps`` attribute @@ -244,6 +254,17 @@ class MaskablePPO(OnPolicyAlgorithm): :return: """ + if eval_env is not None or eval_freq != -1: + warnings.warn( + "Parameters `eval_env` and `eval_freq` are deprecated and will be removed in the future. 
" "Please use `MaskableEvalCallback` or a custom Callback instead.", DeprecationWarning, # By setting the `stacklevel` we refer to the initial caller of the deprecated feature. # This causes the `DeprecationWarning` to not be ignored and to be shown to the user. See # https://github.com/DLR-RM/stable-baselines3/pull/1082#discussion_r989842855 for more details. stacklevel=4, ) + self.start_time = time.time_ns() if self.ep_info_buffer is None or reset_num_timesteps: # Initialize buffers if they don't exist, or reinitialize if resetting counters diff --git a/sb3_contrib/ppo_recurrent/ppo_recurrent.py b/sb3_contrib/ppo_recurrent/ppo_recurrent.py index 645375e..5580fde 100644 --- a/sb3_contrib/ppo_recurrent/ppo_recurrent.py +++ b/sb3_contrib/ppo_recurrent/ppo_recurrent.py @@ -57,7 +57,8 @@ class RecurrentPPO(OnPolicyAlgorithm): By default, there is no limit on the kl div. :param tensorboard_log: the log location for tensorboard (if None, no logging) :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. (Only available when passing string for the environment) + used for evaluating the agent periodically (Only available when passing string for the environment). + Caution, this parameter is deprecated and will be removed in the future. :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug :param seed: Seed for the pseudo random generators @@ -211,8 +212,12 @@ class RecurrentPPO(OnPolicyAlgorithm): :param total_timesteps: The total number of samples (env steps) to train on :param eval_env: Environment to use for evaluation. + Caution, this parameter is deprecated and will be removed in the future. + Please use `EvalCallback` or a custom Callback instead. :param callback: Callback(s) called at every step with state of the algorithm. 
- :param eval_freq: How many steps between evaluations + :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little). + Caution, this parameter is deprecated and will be removed in the future. + Please use `EvalCallback` or a custom Callback instead. :param n_eval_episodes: How many episodes to play per evaluation :param log_path: Path to a folder where the evaluations will be saved :param reset_num_timesteps: Whether to reset or not the ``num_timesteps`` attribute diff --git a/sb3_contrib/qrdqn/qrdqn.py b/sb3_contrib/qrdqn/qrdqn.py index 273665f..67242a3 100644 --- a/sb3_contrib/qrdqn/qrdqn.py +++ b/sb3_contrib/qrdqn/qrdqn.py @@ -49,7 +49,8 @@ class QRDQN(OffPolicyAlgorithm): :param max_grad_norm: The maximum value for the gradient clipping (if None, no clipping) :param tensorboard_log: the log location for tensorboard (if None, no logging) :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. (Only available when passing string for the environment) + used for evaluating the agent periodically (Only available when passing string for the environment). + Caution, this parameter is deprecated and will be removed in the future. :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug :param seed: Seed for the pseudo random generators diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py index 8ec746b..ed2f389 100644 --- a/sb3_contrib/tqc/tqc.py +++ b/sb3_contrib/tqc/tqc.py @@ -56,7 +56,8 @@ class TQC(OffPolicyAlgorithm): :param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling during the warm up phase (before learning starts) :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. 
(Only available when passing string for the environment) + used for evaluating the agent periodically (Only available when passing string for the environment). + Caution, this parameter is deprecated and will be removed in the future. :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug :param seed: Seed for the pseudo random generators diff --git a/sb3_contrib/trpo/trpo.py b/sb3_contrib/trpo/trpo.py index cb35d73..b15a869 100644 --- a/sb3_contrib/trpo/trpo.py +++ b/sb3_contrib/trpo/trpo.py @@ -58,7 +58,8 @@ class TRPO(OnPolicyAlgorithm): see p40-42 of John Schulman thesis http://joschu.net/docs/thesis.pdf :param tensorboard_log: the log location for tensorboard (if None, no logging) :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. (Only available when passing string for the environment) + used for evaluating the agent periodically (Only available when passing string for the environment). + Caution, this parameter is deprecated and will be removed in the future. :param policy_kwargs: additional arguments to be passed to the policy on creation :param verbose: the verbosity level: 0 no output, 1 info, 2 debug :param seed: Seed for the pseudo random generators diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt index 9c6d629..fdd3be6 100644 --- a/sb3_contrib/version.txt +++ b/sb3_contrib/version.txt @@ -1 +1 @@ -1.6.1 +1.6.2 diff --git a/setup.cfg b/setup.cfg index 4a0cf75..63251ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] # This includes the license file in the wheel. -license_file = LICENSE +license_files = LICENSE [tool:pytest] # Deterministic ordering for tests; useful for pytest-xdist. 
diff --git a/setup.py b/setup.py index 8d85487..7c6b397 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ setup( packages=[package for package in find_packages() if package.startswith("sb3_contrib")], package_data={"sb3_contrib": ["py.typed", "version.txt"]}, install_requires=[ - "stable_baselines3>=1.6.1", + "stable_baselines3>=1.6.2", ], description="Contrib package of Stable Baselines3, experimental code.", author="Antonin Raffin", diff --git a/tests/test_invalid_actions.py b/tests/test_invalid_actions.py index 7187800..3509914 100644 --- a/tests/test_invalid_actions.py +++ b/tests/test_invalid_actions.py @@ -68,7 +68,8 @@ def test_eval_env(): env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10) eval_env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10) model = MaskablePPO("MlpPolicy", env, clip_range_vf=0.2, n_steps=32, seed=8) - model.learn(32, eval_env=eval_env, eval_freq=16) + with pytest.warns(DeprecationWarning): # `eval_env` is deprecated + model.learn(32, eval_env=eval_env, eval_freq=16) model.learn(32, reset_num_timesteps=False) diff --git a/tests/test_lstm.py b/tests/test_lstm.py index eb320bf..799b8dc 100644 --- a/tests/test_lstm.py +++ b/tests/test_lstm.py @@ -129,30 +129,32 @@ def test_check(): @pytest.mark.parametrize("env", ["Pendulum-v1", "CartPole-v1"]) def test_run(env): - model = RecurrentPPO( - "MlpLstmPolicy", - env, - n_steps=16, - seed=0, - create_eval_env=True, - ) + with pytest.warns(DeprecationWarning): # `create_eval_env` and `eval_freq` are deprecated + model = RecurrentPPO( + "MlpLstmPolicy", + env, + n_steps=16, + seed=0, + create_eval_env=True, + ) - model.learn(total_timesteps=32, eval_freq=16) + model.learn(total_timesteps=32, eval_freq=16) def test_run_sde(): - model = RecurrentPPO( - "MlpLstmPolicy", - "Pendulum-v1", - n_steps=16, - seed=0, - create_eval_env=True, - sde_sample_freq=4, - use_sde=True, - clip_range_vf=0.1, - ) + with pytest.warns(DeprecationWarning): # `create_eval_env` and `eval_freq` are 
deprecated + model = RecurrentPPO( + "MlpLstmPolicy", + "Pendulum-v1", + n_steps=16, + seed=0, + create_eval_env=True, + sde_sample_freq=4, + use_sde=True, + clip_range_vf=0.1, + ) - model.learn(total_timesteps=200, eval_freq=150) + model.learn(total_timesteps=200, eval_freq=150) @pytest.mark.parametrize( diff --git a/tests/test_run.py b/tests/test_run.py index c9c8584..c409e4c 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -10,16 +10,17 @@ from sb3_contrib.common.vec_env import AsyncEval @pytest.mark.parametrize("ent_coef", ["auto", 0.01, "auto_0.01"]) def test_tqc(ent_coef): - model = TQC( - "MlpPolicy", - "Pendulum-v1", - policy_kwargs=dict(net_arch=[64, 64]), - learning_starts=100, - verbose=1, - create_eval_env=True, - ent_coef=ent_coef, - ) - model.learn(total_timesteps=300, eval_freq=250) + with pytest.warns(DeprecationWarning): # `create_eval_env` and `eval_freq` are deprecated + model = TQC( + "MlpPolicy", + "Pendulum-v1", + policy_kwargs=dict(net_arch=[64, 64]), + learning_starts=100, + verbose=1, + create_eval_env=True, + ent_coef=ent_coef, + ) + model.learn(total_timesteps=300, eval_freq=250) @pytest.mark.parametrize("n_critics", [1, 3]) @@ -50,17 +51,18 @@ def test_sde(): def test_qrdqn(): - model = QRDQN( - "MlpPolicy", - "CartPole-v1", - policy_kwargs=dict(n_quantiles=25, net_arch=[64, 64]), - learning_starts=100, - buffer_size=500, - learning_rate=3e-4, - verbose=1, - create_eval_env=True, - ) - model.learn(total_timesteps=500, eval_freq=250) + with pytest.warns(DeprecationWarning): # `create_eval_env` and `eval_freq` are deprecated + model = QRDQN( + "MlpPolicy", + "CartPole-v1", + policy_kwargs=dict(n_quantiles=25, net_arch=[64, 64]), + learning_starts=100, + buffer_size=500, + learning_rate=3e-4, + verbose=1, + create_eval_env=True, + ) + model.learn(total_timesteps=500, eval_freq=250) @pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) @@ -89,7 +91,8 @@ def test_trpo_params(): @pytest.mark.parametrize("policy_str", 
["LinearPolicy", "MlpPolicy"]) def test_ars(policy_str, env_id): model = ARS(policy_str, env_id, n_delta=1, verbose=1, seed=0) - model.learn(total_timesteps=500, log_interval=1, eval_freq=250) + with pytest.warns(DeprecationWarning):  # `eval_freq` is deprecated + model.learn(total_timesteps=500, log_interval=1, eval_freq=250) def test_ars_multi_env():