SB3: PPO for Knights-Archers-Zombies
This tutorial shows how to train agents in the Knights-Archers-Zombies environment (AEC) using Proximal Policy Optimization (PPO).
We use SuperSuit to create vectorized environments, using multithreading to speed up training (see SB3's documentation on vectorized environments).
After training and evaluation, the script launches a demo game with human rendering. The trained model is saved to and loaded from disk (see SB3's documentation on model saving).
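As a rough orientation, the end-to-end flow used by the full script below looks like the following minimal sketch (the file name "kaz_ppo_demo" and the reduced set of keyword arguments are placeholders, not part of the original script):

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

from pettingzoo.butterfly import knights_archers_zombies_v10

# Parallel API version of the environment; Black Death keeps the agent count constant
env = knights_archers_zombies_v10.parallel_env(max_cycles=100, vector_state=True)
env = ss.black_death_v3(env)

# Convert the PettingZoo env into an SB3-compatible vectorized env (8 stacked copies)
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, 8, num_cpus=1, base_class="stable_baselines3")

# Train, save to disk, and load back
model = PPO(MlpPolicy, env, verbose=3, batch_size=256)
model.learn(total_timesteps=81_920)
model.save("kaz_ppo_demo")  # placeholder file name
model = PPO.load("kaz_ppo_demo")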
If the observation space is visual (set vector_state=False in env_kwargs), we pre-process with color reduction, resizing, and frame stacking, and use a CNN policy.
Note
When this environment is configured for visual (3-dimensional) observations, we use a CNN feature extractor; with the default vector observations, an MLP policy is used instead.
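For reference, the preprocessing branch of the training code below boils down to this short sketch (excerpted and slightly simplified from the full script):

import supersuit as ss
from stable_baselines3.ppo import CnnPolicy, MlpPolicy

from pettingzoo.butterfly import knights_archers_zombies_v10

# vector_state=False switches Knights-Archers-Zombies to image observations
env = knights_archers_zombies_v10.parallel_env(vector_state=False)

visual_observation = not env.unwrapped.vector_state
if visual_observation:
    # Greyscale, downscale from 512px to 84px, and stack 3 consecutive frames
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 3)

# CNN feature extractor for images, MLP for the default vector observations
policy_class = CnnPolicy if visual_observation else MlpPolicy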
Note
This environment allows agents to spawn and die, so it requires SuperSuit's Black Death wrapper, which provides blank observations to dead agents instead of removing them from the environment, as sketched below.
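In code, this is a single wrapper applied to the parallel environment before vectorization (a sketch; the keyword argument is illustrative):

import supersuit as ss
from pettingzoo.butterfly import knights_archers_zombies_v10

env = knights_archers_zombies_v10.parallel_env(max_zombies=4)

# Dead agents stay in the environment and receive blank observations,
# so the number of agents seen by the vectorized env never changes
env = ss.black_death_v3(env)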
Environment Setup
To follow this tutorial, you will need to install the dependencies shown below. It is recommended to use a freshly created virtual environment to avoid dependency conflicts.
pettingzoo[butterfly]>=1.24.0
stable-baselines3>=2.0.0
supersuit>=3.9.0
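For example, assuming pip is available inside your virtual environment, the pinned versions above can be installed directly:

pip install "pettingzoo[butterfly]>=1.24.0" "stable-baselines3>=2.0.0" "supersuit>=3.9.0"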
Code
The following code should run without issue. The comments are designed to help you understand how to use PettingZoo with SB3. If you have any questions, please feel free to ask in the Discord server.
Training and Evaluation
"""Uses Stable-Baselines3 to train agents in the Knights-Archers-Zombies environment using SuperSuit vector envs.
This environment requires using SuperSuit's Black Death wrapper, to handle agent death.
For more information, see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
Author: Elliot (https://github.com/elliottower)
"""
from __future__ import annotations
import glob
import os
import time
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy, MlpPolicy
from pettingzoo.butterfly import knights_archers_zombies_v10


def train(env_fn, steps: int = 10_000, seed: int | None = 0, **env_kwargs):
    # Train a single model to play as each agent in an AEC environment
    env = env_fn.parallel_env(**env_kwargs)

    # Add black death wrapper so the number of agents stays constant
    # MarkovVectorEnv does not support environments with varying numbers of active agents unless black_death is set to True
    env = ss.black_death_v3(env)

    # Pre-process using SuperSuit
    visual_observation = not env.unwrapped.vector_state
    if visual_observation:
        # If the observation space is visual, reduce the color channels, resize from 512px to 84px, and apply frame stacking
        env = ss.color_reduction_v0(env, mode="B")
        env = ss.resize_v1(env, x_size=84, y_size=84)
        env = ss.frame_stack_v1(env, 3)

    env.reset(seed=seed)

    print(f"Starting training on {str(env.metadata['name'])}.")

    env = ss.pettingzoo_env_to_vec_env_v1(env)
    env = ss.concat_vec_envs_v1(env, 8, num_cpus=1, base_class="stable_baselines3")

    # Use a CNN policy if the observation space is visual
    model = PPO(
        CnnPolicy if visual_observation else MlpPolicy,
        env,
        verbose=3,
        batch_size=256,
    )

    model.learn(total_timesteps=steps)

    model.save(f"{env.unwrapped.metadata.get('name')}_{time.strftime('%Y%m%d-%H%M%S')}")

    print("Model has been saved.")

    print(f"Finished training on {str(env.unwrapped.metadata['name'])}.")

    env.close()


def eval(env_fn, num_games: int = 100, render_mode: str | None = None, **env_kwargs):
    # Evaluate a trained agent vs a random agent
    env = env_fn.env(render_mode=render_mode, **env_kwargs)

    # Pre-process using SuperSuit
    visual_observation = not env.unwrapped.vector_state
    if visual_observation:
        # If the observation space is visual, reduce the color channels, resize from 512px to 84px, and apply frame stacking
        env = ss.color_reduction_v0(env, mode="B")
        env = ss.resize_v1(env, x_size=84, y_size=84)
        env = ss.frame_stack_v1(env, 3)

    print(
        f"\nStarting evaluation on {str(env.metadata['name'])} (num_games={num_games}, render_mode={render_mode})"
    )

    try:
        latest_policy = max(
            glob.glob(f"{env.metadata['name']}*.zip"), key=os.path.getctime
        )
    except ValueError:
        print("Policy not found.")
        exit(0)

    model = PPO.load(latest_policy)

    rewards = {agent: 0 for agent in env.possible_agents}

    # Note: we evaluate here using an AEC environment, to allow for easy A/B testing against random policies
    # For example, we can see here that using a random agent for archer_0 results in fewer points than the trained agent
    for i in range(num_games):
        env.reset(seed=i)
        env.action_space(env.possible_agents[0]).seed(i)

        for agent in env.agent_iter():
            obs, reward, termination, truncation, info = env.last()

            for a in env.agents:
                rewards[a] += env.rewards[a]
            if termination or truncation:
                break
            else:
                if agent == env.possible_agents[0]:
                    act = env.action_space(agent).sample()
                else:
                    act = model.predict(obs, deterministic=True)[0]
            env.step(act)
    env.close()

    avg_reward = sum(rewards.values()) / len(rewards.values())
    avg_reward_per_agent = {
        agent: rewards[agent] / num_games for agent in env.possible_agents
    }
    print(f"Avg reward: {avg_reward}")
    print("Avg reward per agent, per game: ", avg_reward_per_agent)
    print("Full rewards: ", rewards)
    return avg_reward


if __name__ == "__main__":
    env_fn = knights_archers_zombies_v10

    # Set vector_state to false in order to use visual observations (significantly longer training time)
    env_kwargs = dict(max_cycles=100, max_zombies=4, vector_state=True)

    # Train a model (takes ~5 minutes on a laptop CPU)
    train(env_fn, steps=81_920, seed=0, **env_kwargs)

    # Evaluate 10 games (takes ~10 seconds on a laptop CPU)
    eval(env_fn, num_games=10, render_mode=None, **env_kwargs)

    # Watch 2 games (takes ~10 seconds on a laptop CPU)
    eval(env_fn, num_games=2, render_mode="human", **env_kwargs)