Sync SB3 Contrib with SB3 (#213)

* Update RTD config
* Switch to ruff for sorting imports
* Evaluate falsy to truthy with not rather than `is False`
* Add `features_extractor` argument to maskable policy
* Add set_options for AsyncEval
* Doc fixes
2023-10-25 14:32:51 +02:00 · 2023-10-25 14:32:51 +02:00 · 4d7ed004af
parent 5be11deaf3
commit 4d7ed004af
14 changed files with 79 additions and 28 deletions
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -14,3 +14,8 @@ formats: all
 # Set requirements using conda env
 conda:
  environment: docs/conda_env.yml
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "mambaforge-22.9"
--- a/Makefile
+++ b/Makefile
@@ -22,13 +22,13 @@ lint:
 format:
 	# Sort imports
-	isort ${LINT_PATHS}
+	ruff --select I ${LINT_PATHS} --fix
 	# Reformat using black
 	black ${LINT_PATHS}
 check-codestyle:
 	# Sort imports
-	isort --check ${LINT_PATHS}
+	ruff --select I ${LINT_PATHS}
 	# Reformat using black
 	black --check ${LINT_PATHS}
--- a/docs/conda_env.yml
+++ b/docs/conda_env.yml
@@ -13,8 +13,7 @@ dependencies:
    - pandas
    - numpy
    - matplotlib
-    - sphinx_autodoc_typehints
    - stable-baselines3>=2.0.0
-    - sphinx>=5.3,<7.0
+    - sphinx>=5,<8
-    - sphinx_rtd_theme>=1.0
+    - sphinx_rtd_theme>=1.3.0
    - sphinx_copybutton
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -64,7 +64,6 @@ release = __version__
 # ones.
 extensions = [
    "sphinx.ext.autodoc",
-    "sphinx_autodoc_typehints",
    "sphinx.ext.autosummary",
    "sphinx.ext.mathjax",
    "sphinx.ext.ifconfig",
@@ -73,6 +72,8 @@ extensions = [
    # 'sphinx.ext.doctest'
 ]
+autodoc_typehints = "description"
 if enable_spell_check:
    extensions.append("sphinxcontrib.spelling")
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -4,6 +4,34 @@ Changelog
 ==========
+Release 2.2.0a8 (WIP)
+--------------------------
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+- Upgraded to Stable-Baselines3 >= 2.2.0
+- Switched to ``ruff`` for sorting imports (isort is no longer needed), black and ruff version now require a minimum version
+- Dropped ``x is False`` in favor of ``not x``, which means that callbacks that wrongly returned None (instead of a boolean) will cause the training to stop (@iwishiwasaneagle)
+New Features:
+^^^^^^^^^^^^^
+- Added ``set_options`` for ``AsyncEval``
+Bug Fixes:
+^^^^^^^^^^
+Deprecations:
+^^^^^^^^^^^^^
+Others:
+^^^^^^^
+- Fixed ``ActorCriticPolicy.extract_features()`` signature by adding an optional ``features_extractor`` argument
+- Update dependencies (accept newer Shimmy/Sphinx version and remove ``sphinx_autodoc_typehints``)
+Documentation:
+^^^^^^^^^^^^^^
 Release 2.1.0 (2023-08-17)
 --------------------------
--- a/docs/modules/trpo.rst
+++ b/docs/modules/trpo.rst
@@ -1,4 +1,4 @@
-.. _tqc:
+.. _trpo:
 .. automodule:: sb3_contrib.trpo
@@ -105,7 +105,7 @@ Run the benchmark (replace ``$ENV_ID`` by the envs mentioned above):
 .. code-block:: bash
-  python train.py --algo tqc --env $ENV_ID --n-eval-envs 10 --eval-episodes 20 --eval-freq 50000
+  python train.py --algo trpo --env $ENV_ID --n-eval-envs 10 --eval-episodes 20 --eval-freq 50000
 Plot the results:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,11 +19,6 @@ max-complexity = 15
 [tool.black]
 line-length = 127
-[tool.isort]
-profile = "black"
-line_length = 127
-src_paths = ["sb3_contrib"]
 [tool.pytype]
 inputs = ['sb3_contrib']
--- a/sb3_contrib/common/maskable/policies.py
+++ b/sb3_contrib/common/maskable/policies.py
@@ -141,15 +141,26 @@ class MaskableActorCriticPolicy(BasePolicy):
        log_prob = distribution.log_prob(actions)
        return actions, values, log_prob
-    def extract_features(self, obs: th.Tensor) -> Union[th.Tensor, Tuple[th.Tensor, th.Tensor]]:
+    def extract_features(
+        self, obs: th.Tensor, features_extractor: Optional[BaseFeaturesExtractor] = None
+    ) -> Union[th.Tensor, Tuple[th.Tensor, th.Tensor]]:
        """
        Preprocess the observation if needed and extract features.
        :param obs: Observation
-        :return: the output of the features extractor(s)
+        :param features_extractor: The features extractor to use. If None, then ``self.features_extractor`` is used.
+        :return: The extracted features. If features extractor is not shared, returns a tuple with the
+            features for the actor and the features for the critic.
        """
        if self.share_features_extractor:
-            return super().extract_features(obs, self.features_extractor)
+            return super().extract_features(obs, features_extractor or self.features_extractor)
        else:
+            if features_extractor is not None:
+                warnings.warn(
+                    "Provided features_extractor will be ignored because the features extractor is not shared.",
+                    UserWarning,
+                )
            pi_features = super().extract_features(obs, self.pi_features_extractor)
            vf_features = super().extract_features(obs, self.vf_features_extractor)
            return pi_features, vf_features
--- a/sb3_contrib/common/vec_env/async_eval.py
+++ b/sb3_contrib/common/vec_env/async_eval.py
@@ -1,6 +1,6 @@
 import multiprocessing as mp
 from collections import defaultdict
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch as th
@@ -57,6 +57,9 @@ def _worker(
            elif cmd == "seed":
                # Note: the seed will only be effective at the next reset
                remote.send(vec_env.seed(seed=data))
+            elif cmd == "set_options":
+                # Note: the options will only be effective at the next reset
+                remote.send(vec_env.set_options(data))
            elif cmd == "get_obs_rms":
                remote.send(obs_rms)
            elif cmd == "sync_obs_rms":
@@ -158,6 +161,19 @@ class AsyncEval:
            remote.send(("seed", seed + idx))
        return [remote.recv() for remote in self.remotes]
+    def set_options(self, options: Optional[Union[List[Dict], Dict]] = None) -> List[Union[None, int]]:
+        """
+        Set environment options for all environments.
+        If a dict is passed instead of a list, the same options will be used for all environments.
+        WARNING: Those options will only be passed to the environment at the next reset.
+        :param options: A dictionary of environment options to pass to each environment at the next reset.
+        :return:
+        """
+        for remote in self.remotes:
+            remote.send(("set_options", options))
+        return [remote.recv() for remote in self.remotes]
    def get_results(self) -> List[Tuple[int, Tuple[np.ndarray, np.ndarray]]]:
        """
        Retreive episode rewards and lengths from each worker
--- a/sb3_contrib/ppo_mask/ppo_mask.py
+++ b/sb3_contrib/ppo_mask/ppo_mask.py
@@ -309,7 +309,7 @@ class MaskablePPO(OnPolicyAlgorithm):
            # Give access to local variables
            callback.update_locals(locals())
-            if callback.on_step() is False:
+            if not callback.on_step():
                return False
            self._update_info_buffer(infos)
@@ -525,7 +525,7 @@ class MaskablePPO(OnPolicyAlgorithm):
        while self.num_timesteps < total_timesteps:
            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, self.n_steps, use_masking)
-            if continue_training is False:
+            if not continue_training:
                break
            iteration += 1
--- a/sb3_contrib/ppo_recurrent/ppo_recurrent.py
+++ b/sb3_contrib/ppo_recurrent/ppo_recurrent.py
@@ -257,7 +257,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
            # Give access to local variables
            callback.update_locals(locals())
-            if callback.on_step() is False:
+            if not callback.on_step():
                return False
            self._update_info_buffer(infos)
@@ -468,7 +468,7 @@ class RecurrentPPO(OnPolicyAlgorithm):
        while self.num_timesteps < total_timesteps:
            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
-            if continue_training is False:
+            if not continue_training:
                break
            iteration += 1
--- a/sb3_contrib/version.txt
+++ b/sb3_contrib/version.txt
@ -1 +1 @@
-2.1.0
+2.2.0a8
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@ setup(
    packages=[package for package in find_packages() if package.startswith("sb3_contrib")],
    package_data={"sb3_contrib": ["py.typed", "version.txt"]},
    install_requires=[
-        "stable_baselines3>=2.1.0",
+        "stable_baselines3>=2.2.0a8,<3.0",
    ],
    description="Contrib package of Stable Baselines3, experimental code.",
    author="Antonin Raffin",
@@ -95,8 +95,3 @@ setup(
        "Programming Language :: Python :: 3.11",
    ],
 )
-# python setup.py sdist
-# python setup.py bdist_wheel
-# twine upload --repository-url https://test.pypi.org/legacy/ dist/*
-# twine upload dist/*
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -111,6 +111,7 @@ def test_ars_multi_env():
    # with parallelism
    async_eval = AsyncEval([lambda: VecNormalize(make_vec_env("Pendulum-v1", n_envs=1)) for _ in range(2)], model.policy)
    async_eval.seed(0)
+    async_eval.set_options()
    model.learn(500, async_eval=async_eval)