Environment Creation
This document gives an overview of creating new environments, along with the relevant utility wrappers, utility classes, and tests that PettingZoo includes for building them (sketches of running these tests follow the AEC and Parallel examples below).
We walk through creating a simple Rock Paper Scissors environment, with example code for both AEC and Parallel environments.
See our Custom Environment Tutorial for a full walkthrough of creating custom environments, including complex environment logic and illegal action masking.
Example Custom Environment
Below is a carefully commented version of the PettingZoo Rock Paper Scissors environment.
import functools
import gymnasium
import numpy as np
from gymnasium.spaces import Discrete
from gymnasium.utils import seeding
from pettingzoo import AECEnv
from pettingzoo.utils import AgentSelector, wrappers
ROCK = 0
PAPER = 1
SCISSORS = 2
NONE = 3
MOVES = ["ROCK", "PAPER", "SCISSORS", "None"]
NUM_ITERS = 100
REWARD_MAP = {
(ROCK, ROCK): (0, 0),
(ROCK, PAPER): (-1, 1),
(ROCK, SCISSORS): (1, -1),
(PAPER, ROCK): (1, -1),
(PAPER, PAPER): (0, 0),
(PAPER, SCISSORS): (-1, 1),
(SCISSORS, ROCK): (-1, 1),
(SCISSORS, PAPER): (1, -1),
(SCISSORS, SCISSORS): (0, 0),
}
def env(render_mode=None):
"""
The env function often wraps the environment in wrappers by default.
You can find full documentation for these methods
elsewhere in the developer documentation.
"""
internal_render_mode = render_mode if render_mode != "ansi" else "human"
env = raw_env(render_mode=internal_render_mode)
# This wrapper is only for environments which print results to the terminal
if render_mode == "ansi":
env = wrappers.CaptureStdoutWrapper(env)
# this wrapper helps error handling for discrete action spaces
env = wrappers.AssertOutOfBoundsWrapper(env)
# Provides a wide variety of helpful user errors
# Strongly recommended
env = wrappers.OrderEnforcingWrapper(env)
return env
class raw_env(AECEnv):
"""
The metadata holds environment constants. From gymnasium, we inherit the "render_modes" metadata,
which specifies which modes can be passed to the render() method.
At least the "human" mode should be supported.
The "name" metadata allows the environment to be pretty printed.
"""
metadata = {"render_modes": ["human"], "name": "rps_v2"}
def __init__(self, render_mode=None):
"""
The init method takes in environment arguments and
should define the following attributes:
- possible_agents
- render_mode
Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated.
Spaces should be defined in the action_space() and observation_space() methods.
If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning.
These attributes should not be changed after initialization.
"""
self.possible_agents = ["player_" + str(r) for r in range(2)]
# optional: a mapping between agent name and ID
self.agent_name_mapping = dict(
zip(self.possible_agents, list(range(len(self.possible_agents))))
)
# optional: we can define the observation and action spaces here as attributes to be used in their corresponding methods
self._action_spaces = {agent: Discrete(3) for agent in self.possible_agents}
self._observation_spaces = {
agent: Discrete(4) for agent in self.possible_agents
}
self.render_mode = render_mode
# Observation space should be defined here.
# lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
# If your spaces change over time, remove this line (disable caching).
@functools.lru_cache(maxsize=None)
def observation_space(self, agent):
# gymnasium spaces are defined and documented here: https://gymnasium.org.cn/api/spaces/
return Discrete(4)
# Action space should be defined here.
# If your spaces change over time, remove this line (disable caching).
@functools.lru_cache(maxsize=None)
def action_space(self, agent):
# We can seed the action space so that sampled actions are reproducible.
return Discrete(3, seed=self.np_random_seed)
def render(self):
"""
Renders the environment. In human mode, it can print to terminal, open
up a graphical window, or open up some other display that a human can see and understand.
"""
if self.render_mode is None:
gymnasium.logger.warn(
"You are calling render method without specifying any render mode."
)
return
if len(self.agents) == 2:
string = "Current state: Agent1: {} , Agent2: {}".format(
MOVES[self.state[self.agents[0]]], MOVES[self.state[self.agents[1]]]
)
else:
string = "Game over"
print(string)
def observe(self, agent):
"""
Observe should return the observation of the specified agent. This function
should return a sane observation (though not necessarily the most up to date possible)
at any time after reset() is called.
"""
# observation of one agent is the previous state of the other
return np.array(self.observations[agent])
def close(self):
"""
Close should release any graphical displays, subprocesses, network connections
or any other environment data which should not be kept around after the
user is no longer using the environment.
"""
pass
def reset(self, seed=None, options=None):
"""
Reset needs to initialize the following attributes
- agents
- rewards
- _cumulative_rewards
- terminations
- truncations
- infos
- agent_selection
And must set up the environment so that render(), step(), and observe()
can be called without issues.
Here it sets up the state dictionary which is used by step() and the observations dictionary which is used by step() and observe()
"""
# Unlike gymnasium's Env, the environment is responsible for setting the random seed explicitly.
if seed is not None:
self.np_random, self.np_random_seed = seeding.np_random(seed)
self.agents = self.possible_agents[:]
self.rewards = {agent: 0 for agent in self.agents}
self._cumulative_rewards = {agent: 0 for agent in self.agents}
self.terminations = {agent: False for agent in self.agents}
self.truncations = {agent: False for agent in self.agents}
self.infos = {agent: {} for agent in self.agents}
self.state = {agent: NONE for agent in self.agents}
self.observations = {agent: NONE for agent in self.agents}
self.num_moves = 0
"""
Our AgentSelector utility allows easy cyclic stepping through the agents list.
"""
self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.next()
def step(self, action):
"""
step(action) takes in an action for the current agent (specified by
agent_selection) and needs to update
- rewards
- _cumulative_rewards (accumulating the rewards)
- terminations
- truncations
- infos
- agent_selection (to the next agent)
And any internal state used by observe() or render()
"""
if (
self.terminations[self.agent_selection]
or self.truncations[self.agent_selection]
):
# handles stepping an agent which is already dead
# accepts a None action for the one agent, and moves the agent_selection to
# the next dead agent, or if there are no more dead agents, to the next live agent
self._was_dead_step(action)
return
agent = self.agent_selection
# the agent which stepped last had its _cumulative_rewards accounted for
# (because it was returned by last()), so the _cumulative_rewards for this
# agent should start again at 0
self._cumulative_rewards[agent] = 0
# stores action of current agent
self.state[self.agent_selection] = action
# collect reward if it is the last agent to act
if self._agent_selector.is_last():
# rewards for all agents are placed in the .rewards dictionary
self.rewards[self.agents[0]], self.rewards[self.agents[1]] = REWARD_MAP[
(self.state[self.agents[0]], self.state[self.agents[1]])
]
self.num_moves += 1
# The truncations dictionary must be updated for all players.
self.truncations = {
agent: self.num_moves >= NUM_ITERS for agent in self.agents
}
# observe the current state
for i in self.agents:
self.observations[i] = self.state[
self.agents[1 - self.agent_name_mapping[i]]
]
else:
# necessary so that observe() returns a reasonable observation at all times.
self.state[self.agents[1 - self.agent_name_mapping[agent]]] = NONE
# no rewards are allocated until both players give an action
self._clear_rewards()
# selects the next agent.
self.agent_selection = self._agent_selector.next()
# Adds .rewards to ._cumulative_rewards
self._accumulate_rewards()
if self.render_mode == "human":
self.render()
To interact with your custom AEC environment, use the following code:
from . import aec_rps
env = aec_rps.env(render_mode="human")
env.reset(seed=42)
for agent in env.agent_iter():
observation, reward, termination, truncation, info = env.last()
if termination or truncation:
action = None
else:
# this is where you would insert your policy
action = env.action_space(agent).sample()
env.step(action)
env.close()
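As mentioned in the introduction, PettingZoo also includes tests for checking a new environment against the API. A minimal sketch of running the AEC API test on this environment, assuming the aec_rps module from the example above and the pettingzoo.test.api_test helper:
from pettingzoo.test import api_test
from . import aec_rps
# api_test steps through the environment and checks that it conforms to the AEC API
env = aec_rps.env()
api_test(env, num_cycles=10, verbose_progress=False)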
Example Custom Parallel Environment
import functools
import gymnasium
import numpy as np
from gymnasium.spaces import Discrete
from gymnasium.utils import seeding
from pettingzoo import ParallelEnv
from pettingzoo.utils import parallel_to_aec, wrappers
ROCK = 0
PAPER = 1
SCISSORS = 2
NO_MOVE = 3
MOVES = ["ROCK", "PAPER", "SCISSORS", "None"]
NUM_ITERS = 100
REWARD_MAP = {
(ROCK, ROCK): (0, 0),
(ROCK, PAPER): (-1, 1),
(ROCK, SCISSORS): (1, -1),
(PAPER, ROCK): (1, -1),
(PAPER, PAPER): (0, 0),
(PAPER, SCISSORS): (-1, 1),
(SCISSORS, ROCK): (-1, 1),
(SCISSORS, PAPER): (1, -1),
(SCISSORS, SCISSORS): (0, 0),
}
def env(render_mode=None):
"""
The env function often wraps the environment in wrappers by default.
You can find full documentation for these methods
elsewhere in the developer documentation.
"""
internal_render_mode = render_mode if render_mode != "ansi" else "human"
env = raw_env(render_mode=internal_render_mode)
# This wrapper is only for environments which print results to the terminal
if render_mode == "ansi":
env = wrappers.CaptureStdoutWrapper(env)
# this wrapper helps error handling for discrete action spaces
env = wrappers.AssertOutOfBoundsWrapper(env)
# Provides a wide variety of helpful user errors
# Strongly recommended
env = wrappers.OrderEnforcingWrapper(env)
return env
def raw_env(render_mode=None):
"""
To support the AEC API, the raw_env() function just uses the parallel_to_aec
function to convert from a ParallelEnv to an AEC env
"""
env = parallel_env(render_mode=render_mode)
env = parallel_to_aec(env)
return env
class parallel_env(ParallelEnv):
metadata = {"render_modes": ["human"], "name": "rps_v2"}
def __init__(self, render_mode=None):
"""
The init method takes in environment arguments and should define the following attributes:
- possible_agents
- render_mode
Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated.
Spaces should be defined in the action_space() and observation_space() methods.
If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning.
These attributes should not be changed after initialization.
"""
self.possible_agents = ["player_" + str(r) for r in range(2)]
# optional: a mapping between agent name and ID
self.agent_name_mapping = dict(
zip(self.possible_agents, list(range(len(self.possible_agents))))
)
self.render_mode = render_mode
# Observation space should be defined here.
# lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space.
# If your spaces change over time, remove this line (disable caching).
@functools.lru_cache(maxsize=None)
def observation_space(self, agent):
# gymnasium spaces are defined and documented here: https://gymnasium.org.cn/api/spaces/
# Discrete(4) means an integer in range(0, 4)
return Discrete(4)
# Action space should be defined here.
# If your spaces change over time, remove this line (disable caching).
@functools.lru_cache(maxsize=None)
def action_space(self, agent):
return Discrete(3, seed=self.np_random_seed)
def render(self):
"""
Renders the environment. In human mode, it can print to terminal, open
up a graphical window, or open up some other display that a human can see and understand.
"""
if self.render_mode is None:
gymnasium.logger.warn(
"You are calling render method without specifying any render mode."
)
return
if len(self.agents) == 2:
string = "Current state: Agent1: {} , Agent2: {}".format(
MOVES[self.state[self.agents[0]]], MOVES[self.state[self.agents[1]]]
)
else:
string = "Game over"
print(string)
def close(self):
"""
Close should release any graphical displays, subprocesses, network connections
or any other environment data which should not be kept around after the
user is no longer using the environment.
"""
pass
def reset(self, seed=None, options=None):
"""
Reset needs to initialize the `agents` attribute and must set up the
environment so that render() and step() can be called without issues.
Here it initializes the `num_moves` variable which counts the number of
hands that are played.
Returns the observations for each agent
"""
if seed is not None:
self.np_random, self.np_random_seed = seeding.np_random(seed)
self.agents = self.possible_agents[:]
self.num_moves = 0
# the observations should be numpy arrays even if there is only one value
observations = {agent: np.array(NO_MOVE) for agent in self.agents}
infos = {agent: {} for agent in self.agents}
self.state = observations
return observations, infos
def step(self, actions):
"""
step(action) takes in an action for each agent and should return the
- observations
- rewards
- terminations
- truncations
- infos
dicts where each dict looks like {agent_1: item_1, agent_2: item_2}
"""
# If a user passes in actions with no agents, then just return empty observations, etc.
if not actions:
self.agents = []
return {}, {}, {}, {}, {}
# rewards for all agents are placed in the rewards dictionary to be returned
rewards = {}
rewards[self.agents[0]], rewards[self.agents[1]] = REWARD_MAP[
(actions[self.agents[0]], actions[self.agents[1]])
]
terminations = {agent: False for agent in self.agents}
self.num_moves += 1
env_truncation = self.num_moves >= NUM_ITERS
truncations = {agent: env_truncation for agent in self.agents}
# Current observation is just the other player's most recent action
# This is converted to a numpy value of type int to match the type
# that we declared in observation_space()
observations = {
self.agents[i]: np.array(actions[self.agents[1 - i]], dtype=np.int64)
for i in range(len(self.agents))
}
self.state = observations
# typically there won't be any information in the infos, but there must
# still be an entry for each agent
infos = {agent: {} for agent in self.agents}
if env_truncation:
self.agents = []
if self.render_mode == "human":
self.render()
return observations, rewards, terminations, truncations, infos
To interact with your custom parallel environment, use the following code:
from . import parallel_rps
env = parallel_rps.parallel_env(render_mode="human")
observations, infos = env.reset(seed=42)
while env.agents:
# this is where you would insert your policy
actions = {agent: env.action_space(agent).sample() for agent in env.agents}
observations, rewards, terminations, truncations, infos = env.step(actions)
env.close()
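The Parallel API has a matching test. A minimal sketch, assuming the parallel_rps module above and the pettingzoo.test.parallel_api_test helper:
from pettingzoo.test import parallel_api_test
from . import parallel_rps
# parallel_api_test steps through the environment and checks that it conforms to the Parallel API
env = parallel_rps.parallel_env()
parallel_api_test(env, num_cycles=10)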
Using Wrappers
A wrapper is an environment transformation that takes an environment as input and outputs a new environment that is similar to the input environment, but with some transformation or validation applied. PettingZoo provides wrappers for converting environments between the AEC API and the Parallel API, as well as a set of simple utility wrappers that provide input validation and other convenient, reusable logic. PettingZoo also provides wrappers via the companion package SuperSuit (pip install supersuit).
from pettingzoo.butterfly import pistonball_v6
from pettingzoo.utils import ClipOutOfBoundsWrapper
env = pistonball_v6.env()
wrapped_env = ClipOutOfBoundsWrapper(env)
# Wrapped environments must be reset before use
wrapped_env.reset()
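The conversion wrappers mentioned above can be used in the same way. A minimal sketch, assuming parallel_to_aec and aec_to_parallel are exported from pettingzoo.utils as in recent PettingZoo versions:
from pettingzoo.butterfly import pistonball_v6
from pettingzoo.utils import aec_to_parallel, parallel_to_aec
# start from the Parallel API version of the environment
parallel_env = pistonball_v6.parallel_env()
# convert it to an AEC environment ...
aec_env = parallel_to_aec(parallel_env)
# ... and back to a Parallel environment
parallel_again = aec_to_parallel(aec_env)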
Developer Utils
The utils directory contains a few functions that are helpful for debugging environments. These are documented in the API docs.
The utils directory also contains some classes that are only helpful for developing new environments. These are documented below.
Agent Selector
The AgentSelector class steps through the agents in a cycle.
It can be used to cycle through the list of agents as follows:
from pettingzoo.utils import AgentSelector
agents = ["agent_1", "agent_2", "agent_3"]
selector = AgentSelector(agents)
agent_selection = selector.reset()
# agent_selection will be "agent_1"
for i in range(100):
agent_selection = selector.next()
# will select "agent_2", "agent_3", "agent_1", "agent_2", "agent_3", ...
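The selector also reports where the current agent sits in the cycle; the step() method in the AEC example above relies on is_last() to know when every agent has acted. A small sketch of the is_first() and is_last() helpers, assuming the same agents list:
from pettingzoo.utils import AgentSelector
agents = ["agent_1", "agent_2", "agent_3"]
selector = AgentSelector(agents)
selector.reset()            # selects "agent_1"
print(selector.is_first())  # True: "agent_1" is first in the cycle
selector.next()             # selects "agent_2"
selector.next()             # selects "agent_3"
print(selector.is_last())   # True: "agent_3" is last in the cycle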
Deprecated Module
DeprecatedModule is used in PettingZoo to help guide users away from old, obsolete environment versions and toward the new ones. If you wish to create a similar versioning system, it may be helpful.
For example, when a user tries to import the knights_archers_zombies_v0 environment, they import the following variable (defined in pettingzoo/butterfly/__init__.py):
from pettingzoo.utils.deprecated_module import DeprecatedModule
knights_archers_zombies_v0 = DeprecatedModule("knights_archers_zombies", "v0", "v10")
This declaration tells the user that knights_archers_zombies_v0 is deprecated and that knights_archers_zombies_v10 should be used instead. In particular, it raises the following error:
from pettingzoo.butterfly import knights_archers_zombies_v0
knights_archers_zombies_v0.env()
# pettingzoo.utils.deprecated_module.DeprecatedEnv: knights_archers_zombies_v0 is now deprecated, use knights_archers_zombies_v10 instead