From 901a64850737ee27b0d67fa0a81f632a52c5fc7b Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Tue, 22 Feb 2022 15:25:43 +0000 Subject: [PATCH] Upgrade Gym to 0.21 (#59) * Pendulum-v0 -> Pendulum-v1 * Reformat with black * Update changelog * Fix dtype bug in TimeFeatureWrapper * Update version and removed forward calls * Update CI * Fix min version Co-authored-by: Antonin Raffin --- .github/workflows/ci.yml | 2 -- docs/guide/examples.rst | 6 +++--- docs/misc/changelog.rst | 21 ++++++++++++++++++ docs/modules/ars.rst | 2 +- docs/modules/tqc.rst | 2 +- docs/modules/trpo.rst | 2 +- sb3_contrib/ars/policies.py | 2 +- sb3_contrib/common/utils.py | 2 +- sb3_contrib/common/wrappers/time_feature.py | 6 ++++-- sb3_contrib/ppo_mask/ppo_mask.py | 2 +- sb3_contrib/qrdqn/policies.py | 2 +- sb3_contrib/tqc/policies.py | 2 +- sb3_contrib/version.txt | 2 +- setup.py | 2 +- tests/test_deterministic.py | 2 +- tests/test_distributions.py | 4 ++-- tests/test_run.py | 24 ++++++++++----------- tests/test_save_load.py | 4 ++-- tests/test_train_eval_mode.py | 6 +++--- tests/test_utils.py | 8 +++---- tests/wrappers/test_time_feature.py | 4 ++-- 21 files changed, 64 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 56cdf00..b981df6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,8 +36,6 @@ jobs: pip install . # Use headless version pip install opencv-python-headless - # Tmp fix: ROM missing in the newest atari-py version - pip install atari-py==0.2.5 - name: Build the doc run: | make doc diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 6a60be8..d33d4a9 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -12,7 +12,7 @@ Train a Truncated Quantile Critics (TQC) agent on the Pendulum environment. 
from sb3_contrib import TQC - model = TQC("MlpPolicy", "Pendulum-v0", top_quantiles_to_drop_per_net=2, verbose=1) + model = TQC("MlpPolicy", "Pendulum-v1", top_quantiles_to_drop_per_net=2, verbose=1) model.learn(total_timesteps=10_000, log_interval=4) model.save("tqc_pendulum") @@ -54,7 +54,7 @@ Train a Trust Region Policy Optimization (TRPO) agent on the Pendulum environmen from sb3_contrib import TRPO - model = TRPO("MlpPolicy", "Pendulum-v0", gamma=0.9, verbose=1) + model = TRPO("MlpPolicy", "Pendulum-v1", gamma=0.9, verbose=1) model.learn(total_timesteps=100_000, log_interval=4) model.save("trpo_pendulum") @@ -68,6 +68,6 @@ Train an agent using Augmented Random Search (ARS) agent on the Pendulum environ from sb3_contrib import ARS - model = ARS("LinearPolicy", "Pendulum-v0", verbose=1) + model = ARS("LinearPolicy", "Pendulum-v1", verbose=1) model.learn(total_timesteps=10000, log_interval=4) model.save("ars_pendulum") diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 1367775..825b80c 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,6 +3,27 @@ Changelog ========== +Release 1.4.1a1 (WIP) +------------------------------- + + +Breaking Changes: +^^^^^^^^^^^^^^^^^ +- Switched minimum Gym version to 0.21.0. 
+- Upgraded to Stable-Baselines3 >= 1.4.1a1 + +New Features: +^^^^^^^^^^^^^ + +Bug Fixes: +^^^^^^^^^^ +- Removed explicit calls to ``forward()`` method as per pytorch guidelines + +Deprecations: +^^^^^^^^^^^^^ + +Others: +^^^^^^^ Release 1.4.0 (2022-01-19) ------------------------------- diff --git a/docs/modules/ars.rst b/docs/modules/ars.rst index de258c2..699e308 100644 --- a/docs/modules/ars.rst +++ b/docs/modules/ars.rst @@ -64,7 +64,7 @@ Example from sb3_contrib import ARS # Policy can be LinearPolicy or MlpPolicy - model = ARS("LinearPolicy", "Pendulum-v0", verbose=1) + model = ARS("LinearPolicy", "Pendulum-v1", verbose=1) model.learn(total_timesteps=10000, log_interval=4) model.save("ars_pendulum") diff --git a/docs/modules/tqc.rst b/docs/modules/tqc.rst index 8341f86..b44d577 100644 --- a/docs/modules/tqc.rst +++ b/docs/modules/tqc.rst @@ -57,7 +57,7 @@ Example from sb3_contrib import TQC - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") policy_kwargs = dict(n_critics=2, n_quantiles=25) model = TQC("MlpPolicy", env, top_quantiles_to_drop_per_net=2, verbose=1, policy_kwargs=policy_kwargs) diff --git a/docs/modules/trpo.rst b/docs/modules/trpo.rst index 4ff8828..90192c6 100644 --- a/docs/modules/trpo.rst +++ b/docs/modules/trpo.rst @@ -54,7 +54,7 @@ Example from sb3_contrib import TRPO - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") model = TRPO("MlpPolicy", env, verbose=1) model.learn(total_timesteps=10000, log_interval=4) diff --git a/sb3_contrib/ars/policies.py b/sb3_contrib/ars/policies.py index e52ed15..e90927d 100644 --- a/sb3_contrib/ars/policies.py +++ b/sb3_contrib/ars/policies.py @@ -76,7 +76,7 @@ class ARSPolicy(BasePolicy): def _predict(self, observation: th.Tensor, deterministic: bool = True) -> th.Tensor: # Non deterministic action does not really make sense for ARS, we ignore this parameter for now..
- return self.forward(observation) + return self(observation) class ARSLinearPolicy(ARSPolicy): diff --git a/sb3_contrib/common/utils.py b/sb3_contrib/common/utils.py index daa6863..d380bb7 100644 --- a/sb3_contrib/common/utils.py +++ b/sb3_contrib/common/utils.py @@ -61,7 +61,7 @@ def quantile_huber_loss( # Note: in both cases, the loss has the same shape as pairwise_delta pairwise_delta = target_quantiles.unsqueeze(-2) - current_quantiles.unsqueeze(-1) abs_pairwise_delta = th.abs(pairwise_delta) - huber_loss = th.where(abs_pairwise_delta > 1, abs_pairwise_delta - 0.5, pairwise_delta ** 2 * 0.5) + huber_loss = th.where(abs_pairwise_delta > 1, abs_pairwise_delta - 0.5, pairwise_delta**2 * 0.5) loss = th.abs(cum_prob - (pairwise_delta.detach() < 0).float()) * huber_loss if sum_over_quantiles: loss = loss.sum(dim=-2).mean() diff --git a/sb3_contrib/common/wrappers/time_feature.py b/sb3_contrib/common/wrappers/time_feature.py index 57a8279..b63f19c 100644 --- a/sb3_contrib/common/wrappers/time_feature.py +++ b/sb3_contrib/common/wrappers/time_feature.py @@ -43,11 +43,12 @@ class TimeFeatureWrapper(gym.Wrapper): low, high = obs_space.low, obs_space.high low, high = np.concatenate((low, [0.0])), np.concatenate((high, [1.0])) + self.dtype = obs_space.dtype if isinstance(env.observation_space, gym.spaces.Dict): - env.observation_space.spaces["observation"] = gym.spaces.Box(low=low, high=high, dtype=np.float32) + env.observation_space.spaces["observation"] = gym.spaces.Box(low=low, high=high, dtype=self.dtype) else: - env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) + env.observation_space = gym.spaces.Box(low=low, high=high, dtype=self.dtype) super(TimeFeatureWrapper, self).__init__(env) @@ -84,6 +85,7 @@ class TimeFeatureWrapper(gym.Wrapper): time_feature = 1 - (self._current_step / self._max_steps) if self._test_mode: time_feature = 1.0 + time_feature = np.array(time_feature, dtype=self.dtype) if isinstance(obs, dict): obs["observation"] = 
np.append(obs["observation"], time_feature) diff --git a/sb3_contrib/ppo_mask/ppo_mask.py b/sb3_contrib/ppo_mask/ppo_mask.py index 26c2d8e..767f36e 100644 --- a/sb3_contrib/ppo_mask/ppo_mask.py +++ b/sb3_contrib/ppo_mask/ppo_mask.py @@ -317,7 +317,7 @@ class MaskablePPO(OnPolicyAlgorithm): if use_masking: action_masks = get_action_masks(env) - actions, values, log_probs = self.policy.forward(obs_tensor, action_masks=action_masks) + actions, values, log_probs = self.policy(obs_tensor, action_masks=action_masks) actions = actions.cpu().numpy() new_obs, rewards, dones, infos = env.step(actions) diff --git a/sb3_contrib/qrdqn/policies.py b/sb3_contrib/qrdqn/policies.py index ba42ecf..d21b4ba 100644 --- a/sb3_contrib/qrdqn/policies.py +++ b/sb3_contrib/qrdqn/policies.py @@ -69,7 +69,7 @@ class QuantileNetwork(BasePolicy): return quantiles.view(-1, self.n_quantiles, self.action_space.n) def _predict(self, observation: th.Tensor, deterministic: bool = True) -> th.Tensor: - q_values = self.forward(observation).mean(dim=1) + q_values = self(observation).mean(dim=1) # Greedy action action = q_values.argmax(dim=1).reshape(-1) return action diff --git a/sb3_contrib/tqc/policies.py b/sb3_contrib/tqc/policies.py index 089a2f1..d098be0 100644 --- a/sb3_contrib/tqc/policies.py +++ b/sb3_contrib/tqc/policies.py @@ -181,7 +181,7 @@ class Actor(BasePolicy): return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs) def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: - return self.forward(observation, deterministic) + return self(observation, deterministic) class Critic(BaseModel): diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt index 88c5fb8..d012e1c 100644 --- a/sb3_contrib/version.txt +++ b/sb3_contrib/version.txt @@ -1 +1 @@ -1.4.0 +1.4.1a1 diff --git a/setup.py b/setup.py index cb054b7..e3051b5 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ setup( packages=[package for package in find_packages() if 
package.startswith("sb3_contrib")], package_data={"sb3_contrib": ["py.typed", "version.txt"]}, install_requires=[ - "stable_baselines3>=1.4.0", + "stable_baselines3>=1.4.1a1", ], description="Contrib package of Stable Baselines3, experimental code.", author="Antonin Raffin", diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py index 19e7175..1ba7283 100644 --- a/tests/test_deterministic.py +++ b/tests/test_deterministic.py @@ -17,7 +17,7 @@ def test_deterministic_training_common(algo): rewards = [[], []] # Smaller network kwargs = {"policy_kwargs": dict(net_arch=[64])} - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" if algo == ARS_MULTI: algo = ARS ars_multi = True diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 183952d..bb3cf26 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -222,7 +222,7 @@ class TestMaskableMultiCategoricalDistribution: assert len(dist.distributions) == NUM_CATS for i in range(NUM_CATS): assert (dist.distributions[i].probs == 0.5).all() - assert int(dist.entropy().exp()) == DIMS_PER_CAT ** NUM_CATS + assert int(dist.entropy().exp()) == DIMS_PER_CAT**NUM_CATS for i in range(DIMS_PER_CAT): mask = np.array([False] * DIMS_PER_CAT * NUM_CATS) @@ -240,7 +240,7 @@ class TestMaskableMultiCategoricalDistribution: dist.apply_masking(None) for i in range(NUM_CATS): assert (dist.distributions[i].probs == 0.5).all() - assert int(dist.entropy().exp()) == DIMS_PER_CAT ** NUM_CATS + assert int(dist.entropy().exp()) == DIMS_PER_CAT**NUM_CATS class TestMaskableBernoulliDistribution: diff --git a/tests/test_run.py b/tests/test_run.py index be8aef3..d6a2307 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -11,7 +11,7 @@ from sb3_contrib.common.vec_env import AsyncEval def test_tqc(ent_coef): model = TQC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -26,7 +26,7 @@ def test_n_critics(n_critics): # Test TQC 
with different number of critics model = TQC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64], n_critics=n_critics), learning_starts=100, verbose=1, @@ -37,7 +37,7 @@ def test_n_critics(n_critics): def test_sde(): model = TQC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64]), use_sde=True, learning_starts=100, @@ -62,7 +62,7 @@ def test_qrdqn(): model.learn(total_timesteps=500, eval_freq=250) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) def test_trpo(env_id): model = TRPO("MlpPolicy", env_id, n_steps=128, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1) model.learn(total_timesteps=500) @@ -72,7 +72,7 @@ def test_trpo_params(): # Test with gSDE and subsampling model = TRPO( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", n_steps=64, batch_size=32, use_sde=True, @@ -84,7 +84,7 @@ def test_trpo_params(): model.learn(total_timesteps=500) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) @pytest.mark.parametrize("policy_str", ["LinearPolicy", "MlpPolicy"]) def test_ars(policy_str, env_id): model = ARS(policy_str, env_id, n_delta=1, verbose=1, seed=0) @@ -92,14 +92,14 @@ def test_ars(policy_str, env_id): def test_ars_multi_env(): - env = make_vec_env("Pendulum-v0", n_envs=2) + env = make_vec_env("Pendulum-v1", n_envs=2) model = ARS("MlpPolicy", env, n_delta=1) model.learn(total_timesteps=250) - env = VecNormalize(make_vec_env("Pendulum-v0", n_envs=1)) + env = VecNormalize(make_vec_env("Pendulum-v1", n_envs=1)) model = ARS("MlpPolicy", env, n_delta=2, seed=0) # with parallelism - async_eval = AsyncEval([lambda: VecNormalize(make_vec_env("Pendulum-v0", n_envs=1)) for _ in range(2)], model.policy) + async_eval = AsyncEval([lambda: VecNormalize(make_vec_env("Pendulum-v1", n_envs=1)) for _ in range(2)], model.policy) async_eval.seed(0) 
model.learn(500, async_eval=async_eval) @@ -109,17 +109,17 @@ def test_ars_n_top(n_top): n_delta = 3 if n_top > n_delta: with pytest.warns(UserWarning): - model = ARS("MlpPolicy", "Pendulum-v0", n_delta=n_delta, n_top=n_top) + model = ARS("MlpPolicy", "Pendulum-v1", n_delta=n_delta, n_top=n_top) model.learn(total_timesteps=500) else: - model = ARS("MlpPolicy", "Pendulum-v0", n_delta=n_delta, n_top=n_top) + model = ARS("MlpPolicy", "Pendulum-v1", n_delta=n_delta, n_top=n_top) model.learn(total_timesteps=500) @pytest.mark.parametrize("model_class", [TQC, QRDQN]) def test_offpolicy_multi_env(model_class): if model_class in [TQC]: - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" policy_kwargs = dict(net_arch=[64], n_critics=1) else: env_id = "CartPole-v1" diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 477b7a1..2841435 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -447,7 +447,7 @@ def test_save_load_q_net(tmp_path, model_class, policy_str): def test_save_load_pytorch_var(tmp_path): - model = TQC("MlpPolicy", "Pendulum-v0", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = TQC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "tqc_pendulum") model.save(save_path) @@ -464,7 +464,7 @@ def test_save_load_pytorch_var(tmp_path): assert not th.allclose(log_ent_coef_before, log_ent_coef_after) # With a fixed entropy coef - model = TQC("MlpPolicy", "Pendulum-v0", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = TQC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "tqc_pendulum") model.save(save_path) diff --git a/tests/test_train_eval_mode.py b/tests/test_train_eval_mode.py index 025a143..eb08fcf 100644 --- a/tests/test_train_eval_mode.py +++ b/tests/test_train_eval_mode.py @@ -161,7 +161,7 @@ def 
test_qrdqn_train_with_batch_norm(): def test_tqc_train_with_batch_norm(): model = TQC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -203,7 +203,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): if model_class in [QRDQN]: env_id = "CartPole-v1" else: - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" clone_helper = CLONE_HELPERS[model_class] @@ -230,7 +230,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): @pytest.mark.parametrize("model_class", [QRDQN, TQC]) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_predict_with_dropout_batch_norm(model_class, env_id): if env_id == "CartPole-v1": if model_class in [TQC]: diff --git a/tests/test_utils.py b/tests/test_utils.py index c434dd2..9e941ea 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -42,7 +42,7 @@ def test_cg(): def test_flat_grad(): n_parameters = 12 # 3 * (2 * 2) x = th.nn.Parameter(th.ones(2, 2, requires_grad=True)) - y = (x ** 2).sum() + y = (x**2).sum() flat_grad_out = flat_grad(y, [x, x, x]) assert len(flat_grad_out.shape) == 1 # dy/dx = 2 @@ -55,10 +55,10 @@ def test_trpo_warnings(): # Only 1 step: advantage normalization will return NaN with pytest.raises(AssertionError): - TRPO("MlpPolicy", "Pendulum-v0", n_steps=1) + TRPO("MlpPolicy", "Pendulum-v1", n_steps=1) # One step not advantage normalization: ok - TRPO("MlpPolicy", "Pendulum-v0", n_steps=1, normalize_advantage=False, batch_size=1) + TRPO("MlpPolicy", "Pendulum-v1", n_steps=1, normalize_advantage=False, batch_size=1) # Truncated mini-batch with pytest.warns(UserWarning): - TRPO("MlpPolicy", "Pendulum-v0", n_steps=6, batch_size=8) + TRPO("MlpPolicy", "Pendulum-v1", n_steps=6, batch_size=8) diff --git a/tests/wrappers/test_time_feature.py 
b/tests/wrappers/test_time_feature.py index 1744403..6b43a5b 100644 --- a/tests/wrappers/test_time_feature.py +++ b/tests/wrappers/test_time_feature.py @@ -32,7 +32,7 @@ def check_time_feature(obs, timestep, max_timesteps): def test_time_feature(): - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") env = TimeFeatureWrapper(env) check_env(env, warn=False) # Check for four episodes @@ -58,7 +58,7 @@ def test_time_feature(): check_time_feature(obs["observation"], timestep=1, max_timesteps=500) # In test mode, the time feature must be constant - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") env = TimeFeatureWrapper(env, test_mode=True) obs = env.reset() check_time_feature(obs, timestep=0, max_timesteps=200)