From c75ad7dd58b7634e48c9e345fca8ebb06af3495e Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN <antonin.raffin@ensta.org>
Date: Tue, 11 Oct 2022 13:04:18 +0200
Subject: [PATCH] Remove deprecated features (#108)

* Remove deprecated features

* Upgrade SB3

* Fix tests
---
 docs/misc/changelog.rst                    | 21 +++++++
 sb3_contrib/ars/ars.py                     | 16 -----
 sb3_contrib/common/maskable/evaluation.py  | 10 +--
 sb3_contrib/common/recurrent/policies.py   | 15 -----
 sb3_contrib/ppo_mask/ppo_mask.py           | 72 +---------------------
 sb3_contrib/ppo_recurrent/ppo_recurrent.py | 13 ----
 sb3_contrib/qrdqn/qrdqn.py                 | 13 ----
 sb3_contrib/tqc/policies.py                | 26 --------
 sb3_contrib/tqc/tqc.py                     | 13 ----
 sb3_contrib/trpo/trpo.py                   | 13 ----
 sb3_contrib/version.txt                    |  2 +-
 setup.py                                   |  2 +-
 tests/test_invalid_actions.py              |  9 ---
 tests/test_lstm.py                         | 38 +++++-------
 tests/test_run.py                          | 45 ++++++--------
 15 files changed, 67 insertions(+), 241 deletions(-)

diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index f9be489..5cd2d73 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -3,6 +3,27 @@
 Changelog
 ==========
 
+Release 1.7.0a0 (WIP)
+--------------------------
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+- Removed deprecated ``create_eval_env``, ``eval_env``, ``eval_log_path``, ``n_eval_episodes`` and ``eval_freq`` parameters;
+  please use an ``EvalCallback`` (or ``MaskableEvalCallback`` for ``MaskablePPO``) instead
+- Removed deprecated ``sde_net_arch`` parameter
+
+New Features:
+^^^^^^^^^^^^^
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+
+Others:
+^^^^^^^
+
 
 Release 1.6.2 (2022-10-10)
 --------------------------
diff --git a/sb3_contrib/ars/ars.py b/sb3_contrib/ars/ars.py
index f5ac48a..33a0579 100644
--- a/sb3_contrib/ars/ars.py
+++ b/sb3_contrib/ars/ars.py
@@ -309,10 +309,6 @@ class ARS(BaseAlgorithm):
         callback: MaybeCallback = None,
         log_interval: int = 1,
         tb_log_name: str = "ARS",
-        eval_env: Optional[GymEnv] = None,
-        eval_freq: int = -1,
-        n_eval_episodes: int = 5,
-        eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         async_eval: Optional[AsyncEval] = None,
         progress_bar: bool = False,
@@ -324,14 +320,6 @@ class ARS(BaseAlgorithm):
         :param callback: callback(s) called at every step with state of the algorithm.
         :param log_interval: The number of timesteps before logging.
         :param tb_log_name: the name of the run for TensorBoard logging
-        :param eval_env: Environment to use for evaluation.
-            Caution, this parameter is deprecated and will be removed in the future.
-            Please use `EvalCallback` or a custom Callback instead.
-        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
-            Caution, this parameter is deprecated and will be removed in the future.
-            Please use `EvalCallback` or a custom Callback instead.
-        :param n_eval_episodes: Number of episode to evaluate the agent
-        :param eval_log_path: Path to a folder where the evaluations will be saved
         :param reset_num_timesteps: whether or not to reset the current timestep number (used in logging)
         :param async_eval: The object for asynchronous evaluation of candidates.
         :param progress_bar: Display a progress bar using tqdm and rich.
@@ -340,11 +328,7 @@ class ARS(BaseAlgorithm):
 
         total_steps, callback = self._setup_learn(
             total_timesteps,
-            eval_env,
             callback,
-            eval_freq,
-            n_eval_episodes,
-            eval_log_path,
             reset_num_timesteps,
             tb_log_name,
             progress_bar,
diff --git a/sb3_contrib/common/maskable/evaluation.py b/sb3_contrib/common/maskable/evaluation.py
index 82b8cc3..63ff0b4 100644
--- a/sb3_contrib/common/maskable/evaluation.py
+++ b/sb3_contrib/common/maskable/evaluation.py
@@ -88,18 +88,21 @@ def evaluate_policy(  # noqa: C901
     current_lengths = np.zeros(n_envs, dtype="int")
     observations = env.reset()
     states = None
-
+    episode_starts = np.ones((env.num_envs,), dtype=bool)
     while (episode_counts < episode_count_targets).any():
         if use_masking:
             action_masks = get_action_masks(env)
             actions, state = model.predict(
                 observations,
                 state=states,
+                episode_start=episode_starts,
                 deterministic=deterministic,
                 action_masks=action_masks,
             )
         else:
-            actions, states = model.predict(observations, state=states, deterministic=deterministic)
+            actions, states = model.predict(
+                observations, state=states, episode_start=episode_starts, deterministic=deterministic
+            )
         observations, rewards, dones, infos = env.step(actions)
         current_rewards += rewards
         current_lengths += 1
@@ -110,6 +113,7 @@ def evaluate_policy(  # noqa: C901
                 reward = rewards[i]
                 done = dones[i]
                 info = infos[i]
+                episode_starts[i] = done
 
                 if callback is not None:
                     callback(locals(), globals())
@@ -133,8 +137,6 @@ def evaluate_policy(  # noqa: C901
                         episode_counts[i] += 1
                     current_rewards[i] = 0
                     current_lengths[i] = 0
-                    if states is not None:
-                        states[i] *= 0
 
         if render:
             env.render()
diff --git a/sb3_contrib/common/recurrent/policies.py b/sb3_contrib/common/recurrent/policies.py
index 1ba5273..cbc2718 100644
--- a/sb3_contrib/common/recurrent/policies.py
+++ b/sb3_contrib/common/recurrent/policies.py
@@ -36,9 +36,6 @@ class RecurrentActorCriticPolicy(ActorCriticPolicy):
     :param log_std_init: Initial value for the log standard deviation
     :param full_std: Whether to use (n_features x n_actions) parameters
         for the std instead of only (n_features,) when using gSDE
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -74,7 +71,6 @@ class RecurrentActorCriticPolicy(ActorCriticPolicy):
         use_sde: bool = False,
         log_std_init: float = 0.0,
         full_std: bool = True,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         squash_output: bool = False,
         features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
@@ -99,7 +95,6 @@ class RecurrentActorCriticPolicy(ActorCriticPolicy):
             use_sde,
             log_std_init,
             full_std,
-            sde_net_arch,
             use_expln,
             squash_output,
             features_extractor_class,
@@ -436,9 +431,6 @@ class RecurrentActorCriticCnnPolicy(RecurrentActorCriticPolicy):
     :param log_std_init: Initial value for the log standard deviation
     :param full_std: Whether to use (n_features x n_actions) parameters
         for the std instead of only (n_features,) when using gSDE
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -473,7 +465,6 @@ class RecurrentActorCriticCnnPolicy(RecurrentActorCriticPolicy):
         use_sde: bool = False,
         log_std_init: float = 0.0,
         full_std: bool = True,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         squash_output: bool = False,
         features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN,
@@ -497,7 +488,6 @@ class RecurrentActorCriticCnnPolicy(RecurrentActorCriticPolicy):
             use_sde,
             log_std_init,
             full_std,
-            sde_net_arch,
             use_expln,
             squash_output,
             features_extractor_class,
@@ -528,9 +518,6 @@ class RecurrentMultiInputActorCriticPolicy(RecurrentActorCriticPolicy):
     :param log_std_init: Initial value for the log standard deviation
     :param full_std: Whether to use (n_features x n_actions) parameters
         for the std instead of only (n_features,) when using gSDE
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -565,7 +552,6 @@ class RecurrentMultiInputActorCriticPolicy(RecurrentActorCriticPolicy):
         use_sde: bool = False,
         log_std_init: float = 0.0,
         full_std: bool = True,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         squash_output: bool = False,
         features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor,
@@ -589,7 +575,6 @@ class RecurrentMultiInputActorCriticPolicy(RecurrentActorCriticPolicy):
             use_sde,
             log_std_init,
             full_std,
-            sde_net_arch,
             use_expln,
             squash_output,
             features_extractor_class,
diff --git a/sb3_contrib/ppo_mask/ppo_mask.py b/sb3_contrib/ppo_mask/ppo_mask.py
index 62f92f6..769461d 100644
--- a/sb3_contrib/ppo_mask/ppo_mask.py
+++ b/sb3_contrib/ppo_mask/ppo_mask.py
@@ -1,6 +1,5 @@
 import sys
 import time
-import warnings
 from collections import deque
 from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union
 
@@ -61,9 +60,6 @@ class MaskablePPO(OnPolicyAlgorithm):
         see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
         By default, there is no limit on the kl div.
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
-    :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically (Only available when passing string for the environment).
-        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -96,7 +92,6 @@ class MaskablePPO(OnPolicyAlgorithm):
         max_grad_norm: float = 0.5,
         target_kl: Optional[float] = None,
         tensorboard_log: Optional[str] = None,
-        create_eval_env: bool = False,
         policy_kwargs: Optional[Dict[str, Any]] = None,
         verbose: int = 0,
         seed: Optional[int] = None,
@@ -116,7 +111,6 @@ class MaskablePPO(OnPolicyAlgorithm):
             use_sde=False,
             sde_sample_freq=-1,
             tensorboard_log=tensorboard_log,
-            create_eval_env=create_eval_env,
             policy_kwargs=policy_kwargs,
             verbose=verbose,
             seed=seed,
@@ -179,23 +173,11 @@ class MaskablePPO(OnPolicyAlgorithm):
     def _init_callback(
         self,
         callback: MaybeCallback,
-        eval_env: Optional[VecEnv] = None,
-        eval_freq: int = 10000,
-        n_eval_episodes: int = 5,
-        log_path: Optional[str] = None,
         use_masking: bool = True,
         progress_bar: bool = False,
     ) -> BaseCallback:
         """
         :param callback: Callback(s) called at every step with state of the algorithm.
-        :param eval_env: Environment to use for evaluation.
-            Caution, this parameter is deprecated and will be removed in the future.
-            Please use `MaskableEvalCallback` or a custom Callback instead.
-        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
-            Caution, this parameter is deprecated and will be removed in the future.
-            Please use `MaskableEvalCallback` or a custom Callback instead.
-        :param n_eval_episodes: How many episodes to play per evaluation
-        :param log_path: Path to a folder where the evaluations will be saved
         :param use_masking: Whether or not to use invalid action masks during evaluation
         :param progress_bar: Display a progress bar using tqdm and rich.
         :return: A hybrid callback calling `callback` and performing evaluation.
@@ -212,33 +194,13 @@ class MaskablePPO(OnPolicyAlgorithm):
         if progress_bar:
             callback = CallbackList([callback, ProgressBarCallback()])
 
-        # Create eval callback in charge of the evaluation
-        if eval_env is not None:
-            # Avoid circular import error
-            from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
-
-            eval_callback = MaskableEvalCallback(
-                eval_env,
-                best_model_save_path=log_path,
-                log_path=log_path,
-                eval_freq=eval_freq,
-                n_eval_episodes=n_eval_episodes,
-                use_masking=use_masking,
-                verbose=self.verbose,
-            )
-            callback = CallbackList([callback, eval_callback])
-
         callback.init_callback(self)
         return callback
 
     def _setup_learn(
         self,
         total_timesteps: int,
-        eval_env: Optional[GymEnv],
         callback: MaybeCallback = None,
-        eval_freq: int = 10000,
-        n_eval_episodes: int = 5,
-        log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         tb_log_name: str = "run",
         use_masking: bool = True,
@@ -248,15 +210,7 @@ class MaskablePPO(OnPolicyAlgorithm):
         Initialize different variables needed for training.
 
         :param total_timesteps: The total number of samples (env steps) to train on
-        :param eval_env: Environment to use for evaluation.
-            Caution, this parameter is deprecated and will be removed in the future.
-            Please use `MaskableEvalCallback` or a custom Callback instead.
         :param callback: Callback(s) called at every step with state of the algorithm.
-        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps (this may vary a little).
-            Caution, this parameter is deprecated and will be removed in the future.
-            Please use `MaskableEvalCallback` or a custom Callback instead.
-        :param n_eval_episodes: How many episodes to play per evaluation
-        :param log_path: Path to a folder where the evaluations will be saved
         :param reset_num_timesteps: Whether to reset or not the ``num_timesteps`` attribute
         :param tb_log_name: the name of the run for tensorboard log
         :param use_masking: Whether or not to use invalid action masks during training
@@ -264,17 +218,6 @@ class MaskablePPO(OnPolicyAlgorithm):
         :return:
         """
 
-        if eval_env is not None or eval_freq != -1:
-            warnings.warn(
-                "Parameters `eval_env` and `eval_freq` are deprecated and will be removed in the future. "
-                "Please use `MaskableEvalCallback` or a custom Callback instead.",
-                DeprecationWarning,
-                # By setting the `stacklevel` we refer to the initial caller of the deprecated feature.
-                # This causes the the `DepricationWarning` to not be ignored and to be shown to the user. See
-                # https://github.com/DLR-RM/stable-baselines3/pull/1082#discussion_r989842855 for more details.
-                stacklevel=4,
-            )
-
         self.start_time = time.time_ns()
         if self.ep_info_buffer is None or reset_num_timesteps:
             # Initialize buffers if they don't exist, or reinitialize if resetting counters
@@ -297,17 +240,12 @@ class MaskablePPO(OnPolicyAlgorithm):
             if self._vec_normalize_env is not None:
                 self._last_original_obs = self._vec_normalize_env.get_original_obs()
 
-        if eval_env is not None and self.seed is not None:
-            eval_env.seed(self.seed)
-
-        eval_env = self._get_eval_env(eval_env)
-
         # Configure logger's outputs if no logger was passed
         if not self._custom_logger:
             self._logger = utils.configure_logger(self.verbose, self.tensorboard_log, tb_log_name, reset_num_timesteps)
 
         # Create eval callback if needed
-        callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path, use_masking, progress_bar)
+        callback = self._init_callback(callback, use_masking, progress_bar)
 
         return total_timesteps, callback
 
@@ -564,11 +502,7 @@ class MaskablePPO(OnPolicyAlgorithm):
         total_timesteps: int,
         callback: MaybeCallback = None,
         log_interval: int = 1,
-        eval_env: Optional[GymEnv] = None,
-        eval_freq: int = -1,
-        n_eval_episodes: int = 5,
         tb_log_name: str = "PPO",
-        eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         use_masking: bool = True,
         progress_bar: bool = False,
@@ -577,11 +511,7 @@ class MaskablePPO(OnPolicyAlgorithm):
 
         total_timesteps, callback = self._setup_learn(
             total_timesteps,
-            eval_env,
             callback,
-            eval_freq,
-            n_eval_episodes,
-            eval_log_path,
             reset_num_timesteps,
             tb_log_name,
             use_masking,
diff --git a/sb3_contrib/ppo_recurrent/ppo_recurrent.py b/sb3_contrib/ppo_recurrent/ppo_recurrent.py
index 965e008..da35516 100644
--- a/sb3_contrib/ppo_recurrent/ppo_recurrent.py
+++ b/sb3_contrib/ppo_recurrent/ppo_recurrent.py
@@ -58,9 +58,6 @@ class RecurrentPPO(OnPolicyAlgorithm):
         see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
         By default, there is no limit on the kl div.
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
-    :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically (Only available when passing string for the environment).
-        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -95,7 +92,6 @@ class RecurrentPPO(OnPolicyAlgorithm):
         sde_sample_freq: int = -1,
         target_kl: Optional[float] = None,
         tensorboard_log: Optional[str] = None,
-        create_eval_env: bool = False,
         policy_kwargs: Optional[Dict[str, Any]] = None,
         verbose: int = 0,
         seed: Optional[int] = None,
@@ -115,7 +111,6 @@ class RecurrentPPO(OnPolicyAlgorithm):
             use_sde=use_sde,
             sde_sample_freq=sde_sample_freq,
             tensorboard_log=tensorboard_log,
-            create_eval_env=create_eval_env,
             policy_kwargs=policy_kwargs,
             verbose=verbose,
             seed=seed,
@@ -453,11 +448,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
         total_timesteps: int,
         callback: MaybeCallback = None,
         log_interval: int = 1,
-        eval_env: Optional[GymEnv] = None,
-        eval_freq: int = -1,
-        n_eval_episodes: int = 5,
         tb_log_name: str = "RecurrentPPO",
-        eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         progress_bar: bool = False,
     ) -> RecurrentPPOSelf:
@@ -465,11 +456,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
 
         total_timesteps, callback = self._setup_learn(
             total_timesteps,
-            eval_env,
             callback,
-            eval_freq,
-            n_eval_episodes,
-            eval_log_path,
             reset_num_timesteps,
             tb_log_name,
             progress_bar,
diff --git a/sb3_contrib/qrdqn/qrdqn.py b/sb3_contrib/qrdqn/qrdqn.py
index f521219..d4aef00 100644
--- a/sb3_contrib/qrdqn/qrdqn.py
+++ b/sb3_contrib/qrdqn/qrdqn.py
@@ -50,9 +50,6 @@ class QRDQN(OffPolicyAlgorithm):
     :param exploration_final_eps: final value of random action probability
     :param max_grad_norm: The maximum value for the gradient clipping (if None, no clipping)
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
-    :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically (Only available when passing string for the environment).
-        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -88,7 +85,6 @@ class QRDQN(OffPolicyAlgorithm):
         exploration_final_eps: float = 0.01,
         max_grad_norm: Optional[float] = None,
         tensorboard_log: Optional[str] = None,
-        create_eval_env: bool = False,
         policy_kwargs: Optional[Dict[str, Any]] = None,
         verbose: int = 0,
         seed: Optional[int] = None,
@@ -114,7 +110,6 @@ class QRDQN(OffPolicyAlgorithm):
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
-            create_eval_env=create_eval_env,
             seed=seed,
             sde_support=False,
             optimize_memory_usage=optimize_memory_usage,
@@ -256,11 +251,7 @@ class QRDQN(OffPolicyAlgorithm):
         total_timesteps: int,
         callback: MaybeCallback = None,
         log_interval: int = 4,
-        eval_env: Optional[GymEnv] = None,
-        eval_freq: int = -1,
-        n_eval_episodes: int = 5,
         tb_log_name: str = "QRDQN",
-        eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         progress_bar: bool = False,
     ) -> QRDQNSelf:
@@ -269,11 +260,7 @@ class QRDQN(OffPolicyAlgorithm):
             total_timesteps=total_timesteps,
             callback=callback,
             log_interval=log_interval,
-            eval_env=eval_env,
-            eval_freq=eval_freq,
-            n_eval_episodes=n_eval_episodes,
             tb_log_name=tb_log_name,
-            eval_log_path=eval_log_path,
             reset_num_timesteps=reset_num_timesteps,
             progress_bar=progress_bar,
         )
diff --git a/sb3_contrib/tqc/policies.py b/sb3_contrib/tqc/policies.py
index e8022f9..e2266f3 100644
--- a/sb3_contrib/tqc/policies.py
+++ b/sb3_contrib/tqc/policies.py
@@ -1,4 +1,3 @@
-import warnings
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import gym
@@ -37,9 +36,6 @@ class Actor(BasePolicy):
     :param log_std_init: Initial value for the log standard deviation
     :param full_std: Whether to use (n_features x n_actions) parameters
         for the std instead of only (n_features,) when using gSDE.
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -59,7 +55,6 @@ class Actor(BasePolicy):
         use_sde: bool = False,
         log_std_init: float = -3,
         full_std: bool = True,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         clip_mean: float = 2.0,
         normalize_images: bool = True,
@@ -79,7 +74,6 @@ class Actor(BasePolicy):
         self.features_dim = features_dim
         self.activation_fn = activation_fn
         self.log_std_init = log_std_init
-        self.sde_net_arch = sde_net_arch
         self.use_expln = use_expln
         self.full_std = full_std
         self.clip_mean = clip_mean
@@ -89,9 +83,6 @@ class Actor(BasePolicy):
         self.latent_pi = nn.Sequential(*latent_pi_net)
         last_layer_dim = net_arch[-1] if len(net_arch) > 0 else features_dim
 
-        if sde_net_arch is not None:
-            warnings.warn("sde_net_arch is deprecated and will be removed in SB3 v2.4.0.", DeprecationWarning)
-
         if self.use_sde:
             self.action_dist = StateDependentNoiseDistribution(
                 action_dim, full_std=full_std, use_expln=use_expln, learn_features=True, squash_output=True
@@ -256,9 +247,6 @@ class TQCPolicy(BasePolicy):
     :param activation_fn: Activation function
     :param use_sde: Whether to use State Dependent Exploration or not
     :param log_std_init: Initial value for the log standard deviation
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -287,7 +275,6 @@ class TQCPolicy(BasePolicy):
         activation_fn: Type[nn.Module] = nn.ReLU,
         use_sde: bool = False,
         log_std_init: float = -3,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         clip_mean: float = 2.0,
         features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
@@ -325,9 +312,6 @@ class TQCPolicy(BasePolicy):
         }
         self.actor_kwargs = self.net_args.copy()
 
-        if sde_net_arch is not None:
-            warnings.warn("sde_net_arch is deprecated and will be removed in SB3 v2.4.0.", DeprecationWarning)
-
         sde_kwargs = {
             "use_sde": use_sde,
             "log_std_init": log_std_init,
@@ -442,9 +426,6 @@ class CnnPolicy(TQCPolicy):
     :param activation_fn: Activation function
     :param use_sde: Whether to use State Dependent Exploration or not
     :param log_std_init: Initial value for the log standard deviation
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -471,7 +452,6 @@ class CnnPolicy(TQCPolicy):
         activation_fn: Type[nn.Module] = nn.ReLU,
         use_sde: bool = False,
         log_std_init: float = -3,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         clip_mean: float = 2.0,
         features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN,
@@ -491,7 +471,6 @@ class CnnPolicy(TQCPolicy):
             activation_fn,
             use_sde,
             log_std_init,
-            sde_net_arch,
             use_expln,
             clip_mean,
             features_extractor_class,
@@ -516,9 +495,6 @@ class MultiInputPolicy(TQCPolicy):
     :param activation_fn: Activation function
     :param use_sde: Whether to use State Dependent Exploration or not
     :param log_std_init: Initial value for the log standard deviation
-    :param sde_net_arch: Network architecture for extracting features
-        when using gSDE. If None, the latent features from the policy will be used.
-        Pass an empty list to use the states as features.
     :param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
         a positive standard deviation (cf paper). It allows to keep variance
         above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
@@ -545,7 +521,6 @@ class MultiInputPolicy(TQCPolicy):
         activation_fn: Type[nn.Module] = nn.ReLU,
         use_sde: bool = False,
         log_std_init: float = -3,
-        sde_net_arch: Optional[List[int]] = None,
         use_expln: bool = False,
         clip_mean: float = 2.0,
         features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor,
@@ -565,7 +540,6 @@ class MultiInputPolicy(TQCPolicy):
             activation_fn,
             use_sde,
             log_std_init,
-            sde_net_arch,
             use_expln,
             clip_mean,
             features_extractor_class,
diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py
index df65496..7f95baa 100644
--- a/sb3_contrib/tqc/tqc.py
+++ b/sb3_contrib/tqc/tqc.py
@@ -57,9 +57,6 @@ class TQC(OffPolicyAlgorithm):
         Default: -1 (only sample at the beginning of the rollout)
     :param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
         during the warm up phase (before learning starts)
-    :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically (Only available when passing string for the environment).
-        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -98,7 +95,6 @@ class TQC(OffPolicyAlgorithm):
         sde_sample_freq: int = -1,
         use_sde_at_warmup: bool = False,
         tensorboard_log: Optional[str] = None,
-        create_eval_env: bool = False,
         policy_kwargs: Optional[Dict[str, Any]] = None,
         verbose: int = 0,
         seed: Optional[int] = None,
@@ -124,7 +120,6 @@ class TQC(OffPolicyAlgorithm):
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
-            create_eval_env=create_eval_env,
             seed=seed,
             use_sde=use_sde,
             sde_sample_freq=sde_sample_freq,
@@ -293,11 +288,7 @@ class TQC(OffPolicyAlgorithm):
         total_timesteps: int,
         callback: MaybeCallback = None,
         log_interval: int = 4,
-        eval_env: Optional[GymEnv] = None,
-        eval_freq: int = -1,
-        n_eval_episodes: int = 5,
         tb_log_name: str = "TQC",
-        eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         progress_bar: bool = False,
     ) -> TQCSelf:
@@ -306,11 +297,7 @@ class TQC(OffPolicyAlgorithm):
             total_timesteps=total_timesteps,
             callback=callback,
             log_interval=log_interval,
-            eval_env=eval_env,
-            eval_freq=eval_freq,
-            n_eval_episodes=n_eval_episodes,
             tb_log_name=tb_log_name,
-            eval_log_path=eval_log_path,
             reset_num_timesteps=reset_num_timesteps,
             progress_bar=progress_bar,
         )
diff --git a/sb3_contrib/trpo/trpo.py b/sb3_contrib/trpo/trpo.py
index 7d93130..d8ae9be 100644
--- a/sb3_contrib/trpo/trpo.py
+++ b/sb3_contrib/trpo/trpo.py
@@ -59,9 +59,6 @@ class TRPO(OnPolicyAlgorithm):
     :param sub_sampling_factor: Sub-sample the batch to make computation faster
         see p40-42 of John Schulman thesis http://joschu.net/docs/thesis.pdf
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
-    :param create_eval_env: Whether to create a second environment that will be
-        used for evaluating the agent periodically (Only available when passing string for the environment).
-        Caution, this parameter is deprecated and will be removed in the future.
     :param policy_kwargs: additional arguments to be passed to the policy on creation
     :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
     :param seed: Seed for the pseudo random generators
@@ -96,7 +93,6 @@ class TRPO(OnPolicyAlgorithm):
         target_kl: float = 0.01,
         sub_sampling_factor: int = 1,
         tensorboard_log: Optional[str] = None,
-        create_eval_env: bool = False,
         policy_kwargs: Optional[Dict[str, Any]] = None,
         verbose: int = 0,
         seed: Optional[int] = None,
@@ -120,7 +116,6 @@ class TRPO(OnPolicyAlgorithm):
             policy_kwargs=policy_kwargs,
             verbose=verbose,
             device=device,
-            create_eval_env=create_eval_env,
             seed=seed,
             _init_setup_model=False,
             supported_action_spaces=(
@@ -409,11 +404,7 @@ class TRPO(OnPolicyAlgorithm):
         total_timesteps: int,
         callback: MaybeCallback = None,
         log_interval: int = 1,
-        eval_env: Optional[GymEnv] = None,
-        eval_freq: int = -1,
-        n_eval_episodes: int = 5,
         tb_log_name: str = "TRPO",
-        eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         progress_bar: bool = False,
     ) -> TRPOSelf:
@@ -422,11 +413,7 @@ class TRPO(OnPolicyAlgorithm):
             total_timesteps=total_timesteps,
             callback=callback,
             log_interval=log_interval,
-            eval_env=eval_env,
-            eval_freq=eval_freq,
-            n_eval_episodes=n_eval_episodes,
             tb_log_name=tb_log_name,
-            eval_log_path=eval_log_path,
             reset_num_timesteps=reset_num_timesteps,
             progress_bar=progress_bar,
         )
diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt
index fdd3be6..56fee06 100644
--- a/sb3_contrib/version.txt
+++ b/sb3_contrib/version.txt
@@ -1 +1 @@
-1.6.2
+1.7.0a0
diff --git a/setup.py b/setup.py
index 7c6b397..14214f7 100644
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@ setup(
     packages=[package for package in find_packages() if package.startswith("sb3_contrib")],
     package_data={"sb3_contrib": ["py.typed", "version.txt"]},
     install_requires=[
-        "stable_baselines3>=1.6.2",
+        "stable_baselines3>=1.7.0a0",
     ],
     description="Contrib package of Stable Baselines3, experimental code.",
     author="Antonin Raffin",
diff --git a/tests/test_invalid_actions.py b/tests/test_invalid_actions.py
index 30f2cff..57138f4 100644
--- a/tests/test_invalid_actions.py
+++ b/tests/test_invalid_actions.py
@@ -64,15 +64,6 @@ def test_bootstraping():
     model.learn(128)
 
 
-def test_eval_env():
-    env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10)
-    eval_env = InvalidActionEnvDiscrete(dim=20, n_invalid_actions=10)
-    model = MaskablePPO("MlpPolicy", env, clip_range_vf=0.2, n_steps=32, seed=8)
-    with pytest.warns(DeprecationWarning):  # `eval_env` is deprecated
-        model.learn(32, eval_env=eval_env, eval_freq=16)
-    model.learn(32, reset_num_timesteps=False)
-
-
 def test_supports_discrete_action_space():
     """
     No errors using algorithm with an env that has a discrete action space
diff --git a/tests/test_lstm.py b/tests/test_lstm.py
index 799b8dc..29f5ef2 100644
--- a/tests/test_lstm.py
+++ b/tests/test_lstm.py
@@ -129,32 +129,28 @@ def test_check():
 
 @pytest.mark.parametrize("env", ["Pendulum-v1", "CartPole-v1"])
 def test_run(env):
-    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
-        model = RecurrentPPO(
-            "MlpLstmPolicy",
-            env,
-            n_steps=16,
-            seed=0,
-            create_eval_env=True,
-        )
+    model = RecurrentPPO(
+        "MlpLstmPolicy",
+        env,
+        n_steps=16,
+        seed=0,
+    )
 
-        model.learn(total_timesteps=32, eval_freq=16)
+    model.learn(total_timesteps=32)
 
 
 def test_run_sde():
-    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
-        model = RecurrentPPO(
-            "MlpLstmPolicy",
-            "Pendulum-v1",
-            n_steps=16,
-            seed=0,
-            create_eval_env=True,
-            sde_sample_freq=4,
-            use_sde=True,
-            clip_range_vf=0.1,
-        )
+    model = RecurrentPPO(
+        "MlpLstmPolicy",
+        "Pendulum-v1",
+        n_steps=16,
+        seed=0,
+        sde_sample_freq=4,
+        use_sde=True,
+        clip_range_vf=0.1,
+    )
 
-        model.learn(total_timesteps=200, eval_freq=150)
+    model.learn(total_timesteps=200)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_run.py b/tests/test_run.py
index 09a0090..5238522 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -10,17 +10,15 @@ from sb3_contrib.common.vec_env import AsyncEval
 
 @pytest.mark.parametrize("ent_coef", ["auto", 0.01, "auto_0.01"])
 def test_tqc(ent_coef):
-    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
-        model = TQC(
-            "MlpPolicy",
-            "Pendulum-v1",
-            policy_kwargs=dict(net_arch=[64, 64]),
-            learning_starts=100,
-            verbose=1,
-            create_eval_env=True,
-            ent_coef=ent_coef,
-        )
-        model.learn(total_timesteps=300, eval_freq=250, progress_bar=True)
+    model = TQC(
+        "MlpPolicy",
+        "Pendulum-v1",
+        policy_kwargs=dict(net_arch=[64, 64]),
+        learning_starts=100,
+        verbose=1,
+        ent_coef=ent_coef,
+    )
+    model.learn(total_timesteps=300, progress_bar=True)
 
 
 @pytest.mark.parametrize("n_critics", [1, 3])
@@ -51,18 +49,16 @@ def test_sde():
 
 
 def test_qrdqn():
-    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
-        model = QRDQN(
-            "MlpPolicy",
-            "CartPole-v1",
-            policy_kwargs=dict(n_quantiles=25, net_arch=[64, 64]),
-            learning_starts=100,
-            buffer_size=500,
-            learning_rate=3e-4,
-            verbose=1,
-            create_eval_env=True,
-        )
-        model.learn(total_timesteps=500, eval_freq=250)
+    model = QRDQN(
+        "MlpPolicy",
+        "CartPole-v1",
+        policy_kwargs=dict(n_quantiles=25, net_arch=[64, 64]),
+        learning_starts=100,
+        buffer_size=500,
+        learning_rate=3e-4,
+        verbose=1,
+    )
+    model.learn(total_timesteps=500)
 
 
 @pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"])
@@ -91,8 +87,7 @@ def test_trpo_params():
 @pytest.mark.parametrize("policy_str", ["LinearPolicy", "MlpPolicy"])
 def test_ars(policy_str, env_id):
     model = ARS(policy_str, env_id, n_delta=1, verbose=1, seed=0)
-    with pytest.warns(DeprecationWarning):  # `create_eval_env` and `eval_freq` are deprecated
-        model.learn(total_timesteps=500, log_interval=1, eval_freq=250)
+    model.learn(total_timesteps=500, log_interval=1)
 
 
 def test_ars_multi_env():