Deprecate ``create_eval_env``, ``eval_env`` and ``eval_freq`` parameters (#105)
* Deprecate ``eval_env``, ``eval_freq`` and ``create_eval_env``
* Update changelog
* Typo
* Raise deprecation warning in _setup_learn
* Upgrade to latest SB3 version and update changelog

Co-authored-by: Antonin Raffin <antonin.raffin@ensta.org>
parent 2490468b11
commit dec7b5303a
@@ -3,6 +3,29 @@
 Changelog
 ==========
 
+Release 1.6.2 (2022-10-10)
+--------------------------
+
+**Progress bar and upgrade to latest SB3 version**
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+- Upgraded to Stable-Baselines3 >= 1.6.2
+
+New Features:
+^^^^^^^^^^^^^
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+- Deprecate parameters ``eval_env``, ``eval_freq`` and ``create_eval_env``
+
+Others:
+^^^^^^^
+
+
 Release 1.6.1 (2022-09-29)
 -------------------------------
 
@@ -13,7 +36,6 @@ Breaking Changes:
 - Fixed the issue that ``predict`` does not always return action as ``np.ndarray`` (@qgallouedec)
 - Upgraded to Stable-Baselines3 >= 1.6.1
 
-
 New Features:
 ^^^^^^^^^^^^^
 
@@ -25,7 +47,6 @@ Bug Fixes:
 - Fixed missing verbose parameter passing in the ``MaskableEvalCallback`` constructor (@burakdmb)
 - Fixed the issue that when updating the target network in QRDQN, TQC, the ``running_mean`` and ``running_var`` properties of batch norm layers are not updated (@honglu2875)
 
-
 Deprecations:
 ^^^^^^^^^^^^^
 
@@ -35,7 +56,7 @@ Others:
 
 
 Release 1.6.0 (2022-07-11)
--------------------------------
+--------------------------
 
 **Add RecurrentPPO (aka PPO LSTM)**
 
@@ -321,8 +321,12 @@ class ARS(BaseAlgorithm):
         :param callback: callback(s) called at every step with state of the algorithm.
         :param log_interval: The number of timesteps before logging.
         :param tb_log_name: the name of the run for TensorBoard logging
-        :param eval_env: Environment that will be used to evaluate the agent
-        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little)
+        :param eval_env: Environment to use for evaluation.
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `EvalCallback` or a custom Callback instead.
+        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `EvalCallback` or a custom Callback instead.
         :param n_eval_episodes: Number of episode to evaluate the agent
         :param eval_log_path: Path to a folder where the evaluations will be saved
         :param reset_num_timesteps: whether or not to reset the current timestep number (used in logging)
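Migration note: the replacement for the deprecated ``eval_env``/``eval_freq`` arguments is an ``EvalCallback`` passed to ``learn()``. A minimal sketch; the env id and frequencies below are illustrative, not part of this commit:

    import gym

    from stable_baselines3.common.callbacks import EvalCallback

    from sb3_contrib import ARS

    # Periodic evaluation now lives in a callback instead of learn() arguments
    eval_callback = EvalCallback(gym.make("Pendulum-v1"), eval_freq=250, n_eval_episodes=5)
    model = ARS("MlpPolicy", "Pendulum-v1", n_delta=1, seed=0)
    # Before: model.learn(total_timesteps=500, eval_env=..., eval_freq=250)
    model.learn(total_timesteps=500, callback=eval_callback)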
@@ -1,5 +1,6 @@
 import sys
 import time
+import warnings
 from collections import deque
 from typing import Any, Dict, Optional, Tuple, Type, Union
 
@@ -59,7 +60,8 @@ class MaskablePPO(OnPolicyAlgorithm):
         By default, there is no limit on the kl div.
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
     :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically. (Only available when passing string for the environment)
+        used for evaluating the agent periodically (Only available when passing string for the environment).
+        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
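The same applies to ``create_eval_env`` (which only worked when the environment was given as a string id): build the evaluation environment explicitly and attach it through ``MaskableEvalCallback``. A minimal sketch, reusing the masked test env from this commit; the ``eval_freq`` value and timestep budget are illustrative:

    from sb3_contrib import MaskablePPO
    from sb3_contrib.common.envs import InvalidActionEnvDiscrete
    from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback

    # Build the "second environment" by hand instead of create_eval_env=True
    eval_env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10)
    eval_callback = MaskableEvalCallback(eval_env, eval_freq=500, use_masking=True)

    model = MaskablePPO("MlpPolicy", InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10), n_steps=32)
    model.learn(10_000, callback=eval_callback)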
@@ -183,9 +185,13 @@ class MaskablePPO(OnPolicyAlgorithm):
     ) -> BaseCallback:
         """
         :param callback: Callback(s) called at every step with state of the algorithm.
-        :param eval_freq: How many steps between evaluations; if None, do not evaluate.
+        :param eval_env: Environment to use for evaluation.
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `MaskableEvalCallback` or a custom Callback instead.
+        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `MaskableEvalCallback` or a custom Callback instead.
-        :param n_eval_episodes: How many episodes to play per evaluation
+        :param n_eval_episodes: Number of episodes to rollout during evaluation.
         :param log_path: Path to a folder where the evaluations will be saved
         :param use_masking: Whether or not to use invalid action masks during evaluation
         :return: A hybrid callback calling `callback` and performing evaluation.
@@ -234,8 +240,12 @@ class MaskablePPO(OnPolicyAlgorithm):
 
         :param total_timesteps: The total number of samples (env steps) to train on
+        :param eval_env: Environment to use for evaluation.
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `MaskableEvalCallback` or a custom Callback instead.
         :param callback: Callback(s) called at every step with state of the algorithm.
-        :param eval_freq: How many steps between evaluations
+        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `MaskableEvalCallback` or a custom Callback instead.
         :param n_eval_episodes: How many episodes to play per evaluation
         :param log_path: Path to a folder where the evaluations will be saved
         :param reset_num_timesteps: Whether to reset or not the ``num_timesteps`` attribute
@@ -244,6 +254,17 @@ class MaskablePPO(OnPolicyAlgorithm):
         :return:
         """
 
+        if eval_env is not None or eval_freq != -1:
+            warnings.warn(
+                "Parameters `eval_env` and `eval_freq` are deprecated and will be removed in the future. "
+                "Please use `MaskableEvalCallback` or a custom Callback instead.",
+                DeprecationWarning,
+                # By setting the `stacklevel`, we refer to the initial caller of the deprecated feature.
+                # This causes the `DeprecationWarning` not to be ignored and to be shown to the user. See
+                # https://github.com/DLR-RM/stable-baselines3/pull/1082#discussion_r989842855 for more details.
+                stacklevel=4,
+            )
+
         self.start_time = time.time_ns()
         if self.ep_info_buffer is None or reset_num_timesteps:
             # Initialize buffers if they don't exist, or reinitialize if resetting counters
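Why the ``stacklevel`` matters: each level attributes the warning one frame further up the call stack, and since Python 3.7 a ``DeprecationWarning`` is only displayed by default when it appears to come from ``__main__``, i.e. from the user's own code. A standalone sketch of the mechanism (the helper names are hypothetical; the commit needs level 4 because of an extra internal frame between the user's ``learn()`` call and ``_setup_learn``):

    import warnings

    def _setup_learn():
        # stacklevel=1 would blame this line, 2 the learn() body,
        # 3 the line that called learn() -- here, the top-level script.
        warnings.warn("`eval_env` is deprecated", DeprecationWarning, stacklevel=3)

    def learn():
        _setup_learn()

    learn()  # warning is reported at this line in __main__, so the default filter shows it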
@@ -57,7 +57,8 @@ class RecurrentPPO(OnPolicyAlgorithm):
         By default, there is no limit on the kl div.
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
     :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically. (Only available when passing string for the environment)
+        used for evaluating the agent periodically (Only available when passing string for the environment).
+        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -211,8 +212,12 @@ class RecurrentPPO(OnPolicyAlgorithm):
 
         :param total_timesteps: The total number of samples (env steps) to train on
+        :param eval_env: Environment to use for evaluation.
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `EvalCallback` or a custom Callback instead.
         :param callback: Callback(s) called at every step with state of the algorithm.
-        :param eval_freq: How many steps between evaluations
+        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
+            Caution, this parameter is deprecated and will be removed in the future.
+            Please use `EvalCallback` or a custom Callback instead.
         :param n_eval_episodes: How many episodes to play per evaluation
         :param log_path: Path to a folder where the evaluations will be saved
         :param reset_num_timesteps: Whether to reset or not the ``num_timesteps`` attribute
@@ -49,7 +49,8 @@ class QRDQN(OffPolicyAlgorithm):
     :param max_grad_norm: The maximum value for the gradient clipping (if None, no clipping)
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
     :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically. (Only available when passing string for the environment)
+        used for evaluating the agent periodically (Only available when passing string for the environment).
+        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -56,7 +56,8 @@ class TQC(OffPolicyAlgorithm):
     :param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
         during the warm up phase (before learning starts)
     :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically. (Only available when passing string for the environment)
+        used for evaluating the agent periodically (Only available when passing string for the environment).
+        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -58,7 +58,8 @@ class TRPO(OnPolicyAlgorithm):
         see p40-42 of John Schulman thesis http://joschu.net/docs/thesis.pdf
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
     :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically. (Only available when passing string for the environment)
+        used for evaluating the agent periodically (Only available when passing string for the environment).
+        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -1 +1 @@
-1.6.1
+1.6.2
@@ -1,6 +1,6 @@
 [metadata]
 # This includes the license file in the wheel.
-license_file = LICENSE
+license_files = LICENSE
 
 [tool:pytest]
 # Deterministic ordering for tests; useful for pytest-xdist.
setup.py
@@ -65,7 +65,7 @@ setup(
     packages=[package for package in find_packages() if package.startswith("sb3_contrib")],
     package_data={"sb3_contrib": ["py.typed", "version.txt"]},
     install_requires=[
-        "stable_baselines3>=1.6.1",
+        "stable_baselines3>=1.6.2",
     ],
     description="Contrib package of Stable Baselines3, experimental code.",
     author="Antonin Raffin",
@@ -68,6 +68,7 @@ def test_eval_env():
     env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10)
     eval_env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10)
     model = MaskablePPO("MlpPolicy", env, clip_range_vf=0.2, n_steps=32, seed=8)
+    with pytest.warns(DeprecationWarning):  # `eval_env` is deprecated
         model.learn(32, eval_env=eval_env, eval_freq=16)
     model.learn(32, reset_num_timesteps=False)
@@ -129,6 +129,7 @@ def test_check():
 
 @pytest.mark.parametrize("env", ["Pendulum-v1", "CartPole-v1"])
 def test_run(env):
+    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
         model = RecurrentPPO(
             "MlpLstmPolicy",
             env,
@@ -141,6 +142,7 @@ def test_run(env):
 
 
 def test_run_sde():
+    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
         model = RecurrentPPO(
             "MlpLstmPolicy",
             "Pendulum-v1",
@@ -10,6 +10,7 @@ from sb3_contrib.common.vec_env import AsyncEval
 
 @pytest.mark.parametrize("ent_coef", ["auto", 0.01, "auto_0.01"])
 def test_tqc(ent_coef):
+    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
         model = TQC(
             "MlpPolicy",
             "Pendulum-v1",
@@ -50,6 +51,7 @@ def test_sde():
 
 
 def test_qrdqn():
+    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
         model = QRDQN(
             "MlpPolicy",
             "CartPole-v1",
@@ -89,6 +91,7 @@ def test_trpo_params():
 @pytest.mark.parametrize("policy_str", ["LinearPolicy", "MlpPolicy"])
 def test_ars(policy_str, env_id):
     model = ARS(policy_str, env_id, n_delta=1, verbose=1, seed=0)
+    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
         model.learn(total_timesteps=500, log_interval=1, eval_freq=250)
 
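Projects that cannot migrate right away can acknowledge the warning explicitly rather than letting it fail strict (``-W error``) test runs; a stdlib-only sketch matching the message added above:

    import warnings

    warnings.filterwarnings(
        "ignore",
        message="Parameters `eval_env` and `eval_freq` are deprecated",
        category=DeprecationWarning,
    )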