Source code for momaland.utils.aec_wrappers

"""Various wrappers for AEC MO environments."""
from typing import Optional

import numpy as np
from gymnasium.wrappers.normalize import RunningMeanStd
from pettingzoo.utils.wrappers.base import BaseWrapper


class RecordEpisodeStatistics(BaseWrapper):
    """This wrapper will record episode statistics and print them at the end of each episode."""

    def __init__(self, env):
        """This wrapper will record episode statistics and print them at the end of each episode.

        Args:
            env (env): The environment to apply the wrapper
        """
        BaseWrapper.__init__(self, env)
        self.episode_rewards = {agent: 0 for agent in self.possible_agents}
        self.episode_lengths = {agent: 0 for agent in self.possible_agents}

    def last(self, observe: bool = True):
        """Receives the latest observation from the environment, recording episode statistics."""
        obs, rews, terminated, truncated, infos = super().last(observe=observe)
        for agent in self.env.possible_agents:
            self.episode_rewards[agent] += rews
            self.episode_lengths[agent] += 1
        if terminated or truncated:
            infos["episode"] = {
                "r": self.episode_rewards,
                "l": self.episode_lengths,
            }
        return obs, rews, terminated, truncated, infos

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        """Resets the environment and the episode statistics."""
        super().reset(seed, options)
        for agent in self.env.possible_agents:
            self.episode_rewards[agent] = 0
            self.episode_lengths[agent] = 0
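
A minimal usage sketch for this wrapper (not part of the module itself): it assumes `env` is an already-constructed MOMAland AEC environment and simply samples random actions; the accumulated statistics appear under `info["episode"]` once the selected agent terminates or truncates.

    >>> env = RecordEpisodeStatistics(env)
    >>> env.reset(seed=42)
    >>> for agent in env.agent_iter():
    ...     obs, rew, term, trunc, info = env.last()
    ...     env.step(None if (term or trunc) else env.action_space(agent).sample())
    ...     if "episode" in info:
    ...         print(info["episode"]["r"][agent], info["episode"]["l"][agent])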


class LinearizeReward(BaseWrapper):
    """Convert MO reward vector into scalar SO reward value.

    `weights` represents the weights of each objective in the reward vector space for each agent.

    Example:
        >>> weights = {"agent_0": np.array([0.1, 0.9]), "agent_1": np.array([0.2, 0.8])}
        ... env = LinearizeReward(env, weights)
    """

    def __init__(self, env, weights: dict):
        """Reward linearization class initializer.

        Args:
            env: base env to add the wrapper on.
            weights: a dict where keys are agents and values are vectors representing the weights of their rewards.
        """
        self.weights = weights
        super().__init__(env)

    def last(self, observe: bool = True):
        """Returns a reward scalar from the reward vector."""
        observation, rewards, termination, truncation, info = self.env.last(observe=observe)
        if self.env.agent_selection in list(self.weights.keys()):
            rewards = np.dot(rewards, self.weights[self.env.agent_selection])
        return observation, rewards, termination, truncation, info
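
A usage sketch expanding on the docstring example above (an illustrative assumption, not a prescribed recipe): it builds equal weights for every agent from the environment's `reward_space`, so the scalar returned by `last()` is the mean of the objectives.

    >>> weights = {
    ...     agent: np.full(env.reward_space(agent).shape[0], 1.0 / env.reward_space(agent).shape[0])
    ...     for agent in env.possible_agents
    ... }
    >>> env = LinearizeReward(env, weights)
    >>> env.reset(seed=42)
    >>> _, reward, _, _, _ = env.last()  # `reward` is now a scalar, np.dot(vector, weights)
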
class NormalizeReward(BaseWrapper):
    r"""This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.

    The exponential moving average will have variance :math:`(1 - \gamma)^2`.

    Note:
        The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
        instantiated or the policy was changed recently.

    Example:
        >>> for agent in env.possible_agents:
        ...     for idx in range(env.reward_space(agent).shape[0]):
        ...         env = AECWrappers.NormalizeReward(env, agent, idx)
    """

    def __init__(
        self,
        env,
        agent,
        idx,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
    ):
        """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.

        Args:
            env: The environment to apply the wrapper
            agent: the agent whose reward will be normalized
            idx: the index of the rewards that will be normalized.
            epsilon: A stability parameter
            gamma: The discount factor that is used in the exponential moving average.
        """
        super().__init__(env)
        self.agent = agent
        self.idx = idx
        self.return_rms = RunningMeanStd(shape=())
        self.returns = np.array([0.0])
        self.gamma = gamma
        self.epsilon = epsilon

    def last(self, observe: bool = True):
        """Steps through the environment, normalizing the rewards returned."""
        observation, rewards, terminations, truncation, infos = self.env.last(observe)
        if self.agent != self.env.agent_selection:
            return observation, rewards, terminations, truncation, infos
        # Extracts the objective value to normalize
        to_normalize = rewards[self.idx] if isinstance(rewards, np.ndarray) else rewards  # array vs float
        self.returns = self.returns * self.gamma * (1 - terminations) + to_normalize
        # Defer normalization to gym implementation
        to_normalize = self.normalize(to_normalize)
        # Injecting the normalized objective value back into the reward vector
        # array vs float
        if isinstance(rewards, np.ndarray):
            rewards[self.idx] = to_normalize
        else:
            rewards = to_normalize
        return observation, rewards, terminations, truncation, infos

    def normalize(self, to_normalize):
        """Normalizes the rewards with the running mean rewards and their variance."""
        self.return_rms.update(self.returns)
        return to_normalize / np.sqrt(self.return_rms.var + self.epsilon)
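
Because each instance normalizes a single objective of a single agent, the wrapper is typically applied once per (agent, objective) pair, as in the docstring example above. The sketch below (the agent names and two-objective reward space are illustrative assumptions) stacks it under `LinearizeReward`, so the objectives are rescaled before being combined into a scalar.

    >>> for agent in env.possible_agents:
    ...     for idx in range(env.reward_space(agent).shape[0]):
    ...         env = NormalizeReward(env, agent, idx)
    >>> weights = {"agent_0": np.array([0.5, 0.5]), "agent_1": np.array([0.5, 0.5])}
    >>> env = LinearizeReward(env, weights)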