sgoodfriend

A2C playing LunarLander-v2 from https://github.com/sgoodfriend/rl-algo-impls/tree/983cb75e43e51cf4ef57f177194ab9a4a1a8808b

de6a584 over 1 year ago

raw

history blame contribute delete

15 kB

	import logging
	from dataclasses import asdict, dataclass
	from time import perf_counter
	from typing import List, NamedTuple, Optional, TypeVar

	import numpy as np
	import torch
	import torch.nn as nn
	from torch.optim import Adam
	from torch.utils.tensorboard.writer import SummaryWriter

	from rl_algo_impls.shared.algorithm import Algorithm
	from rl_algo_impls.shared.callbacks import Callback
	from rl_algo_impls.shared.gae import compute_advantages
	from rl_algo_impls.shared.policy.actor_critic import ActorCritic
	from rl_algo_impls.shared.schedule import (
	constant_schedule,
	linear_schedule,
	schedule,
	update_learning_rate,
	)
	from rl_algo_impls.shared.stats import log_scalars
	from rl_algo_impls.wrappers.vectorable_wrapper import (
	VecEnv,
	single_action_space,
	single_observation_space,
	)


	class TrainStepStats(NamedTuple):
	    """Scalar diagnostics recorded for a single minibatch gradient step.

	    Instances are immutable tuples; the field order below is also the
	    positional construction order.
	    """

	    # Combined objective that was back-propagated for the minibatch.
	    loss: float
	    # Policy (surrogate) term of the objective.
	    pi_loss: float
	    # Value-function term of the objective.
	    v_loss: float
	    # Entropy term of the objective (negated mean entropy).
	    entropy_loss: float
	    # Estimate of the KL divergence between the old and updated policy.
	    approx_kl: float
	    # Fraction of samples whose probability ratio fell outside the clip range.
	    clipped_frac: float
	    # Fraction of samples whose value change exceeded the value clip range.
	    val_clipped_frac: float


	@dataclass
	class TrainStats:
	    """Per-update training statistics, averaged over minibatch steps.

	    The class is a dataclass so that ``asdict`` can enumerate its fields for
	    logging. ``__init__`` and ``__repr__`` are written by hand; the
	    ``dataclass`` decorator does not replace methods the class already
	    defines.
	    """

	    loss: float
	    pi_loss: float
	    v_loss: float
	    entropy_loss: float
	    approx_kl: float
	    clipped_frac: float
	    val_clipped_frac: float
	    explained_var: float

	    def __init__(self, step_stats: List[TrainStepStats], explained_var: float) -> None:
	        """Average each per-step statistic and store the explained variance.

	        Args:
	            step_stats: Statistics of the individual minibatch steps. Each
	                field is reduced with ``np.mean`` and converted to a Python
	                float.
	            explained_var: Value stored unchanged in ``explained_var``.
	        """
	        self.loss = np.mean([s.loss for s in step_stats]).item()
	        self.pi_loss = np.mean([s.pi_loss for s in step_stats]).item()
	        self.v_loss = np.mean([s.v_loss for s in step_stats]).item()
	        self.entropy_loss = np.mean([s.entropy_loss for s in step_stats]).item()
	        self.approx_kl = np.mean([s.approx_kl for s in step_stats]).item()
	        self.clipped_frac = np.mean([s.clipped_frac for s in step_stats]).item()
	        self.val_clipped_frac = np.mean([s.val_clipped_frac for s in step_stats]).item()
	        self.explained_var = explained_var

	    def write_to_tensorboard(self, tb_writer: SummaryWriter, global_step: int) -> None:
	        """Log every field as a scalar named ``losses/<field>``.

	        Args:
	            tb_writer: Writer whose ``add_scalar`` receives each field.
	            global_step: Step value attached to every scalar.
	        """
	        for name, value in asdict(self).items():
	            tb_writer.add_scalar(f"losses/{name}", value, global_step=global_step)

	    def __repr__(self) -> str:
	        """Return a one-line summary with values rounded to two decimals.

	        ``explained_var`` is not part of the summary.
	        """
	        return " | ".join(
	            [
	                f"Loss: {round(self.loss, 2)}",
	                f"Pi L: {round(self.pi_loss, 2)}",
	                f"V L: {round(self.v_loss, 2)}",
	                f"E L: {round(self.entropy_loss, 2)}",
	                f"Apx KL Div: {round(self.approx_kl, 2)}",
	                f"Clip Frac: {round(self.clipped_frac, 2)}",
	                f"Val Clip Frac: {round(self.val_clipped_frac, 2)}",
	            ]
	        )


	# Self-type for PPO so that ``learn`` can be annotated as returning the
	# (sub)class it was called on.
	PPOSelf = TypeVar("PPOSelf", bound="PPO")


	class PPO(Algorithm):
	    """Trainer that optimizes an ``ActorCritic`` policy with a clipped surrogate loss.

	    Each training iteration collects ``n_steps`` steps from every sub-environment
	    of the vectorized ``env`` and then runs ``n_epochs`` passes of shuffled
	    minibatch updates over that rollout.
	    """

	    def __init__(
	        self,
	        policy: ActorCritic,
	        env: VecEnv,
	        device: torch.device,
	        tb_writer: SummaryWriter,
	        learning_rate: float = 3e-4,
	        learning_rate_decay: str = "none",
	        n_steps: int = 2048,
	        batch_size: int = 64,
	        n_epochs: int = 10,
	        gamma: float = 0.99,
	        gae_lambda: float = 0.95,
	        clip_range: float = 0.2,
	        clip_range_decay: str = "none",
	        clip_range_vf: Optional[float] = None,
	        clip_range_vf_decay: str = "none",
	        normalize_advantage: bool = True,
	        ent_coef: float = 0.0,
	        ent_coef_decay: str = "none",
	        vf_coef: float = 0.5,
	        ppo2_vf_coef_halving: bool = False,
	        max_grad_norm: float = 0.5,
	        sde_sample_freq: int = -1,
	        update_advantage_between_epochs: bool = True,
	        update_returns_between_epochs: bool = False,
	        gamma_end: Optional[float] = None,
	    ) -> None:
	        """Store hyperparameters and build the optimizer and schedules.

	        Args:
	            policy: Actor-critic network to train.
	            env: Vectorized environment to collect rollouts from. If it has a
	                ``get_action_mask`` attribute, that callable is queried for
	                action masks during rollouts.
	            device: Device the rollout tensors are moved to for optimization.
	            tb_writer: Writer that receives training scalars.
	            learning_rate: Initial Adam learning rate.
	            learning_rate_decay: Decay name handed to ``schedule`` together
	                with ``learning_rate``.
	            n_steps: Rollout length per sub-environment per iteration.
	            batch_size: Number of samples per minibatch.
	            n_epochs: Passes over each rollout.
	            gamma: Discount passed to ``compute_advantages`` (start value when
	                ``gamma_end`` is given).
	            gae_lambda: Lambda passed to ``compute_advantages``.
	            clip_range: Initial clip range for the probability ratio.
	            clip_range_decay: Decay name handed to ``schedule`` for ``clip_range``.
	            clip_range_vf: Initial clip range for value updates. A falsy value
	                (``None`` or ``0``) disables value clipping.
	            clip_range_vf_decay: Decay name handed to ``schedule`` for
	                ``clip_range_vf``.
	            normalize_advantage: Standardize advantages within each minibatch.
	            ent_coef: Initial weight of the entropy term.
	            ent_coef_decay: Decay name handed to ``schedule`` for ``ent_coef``.
	            vf_coef: Weight of the value loss in the total loss.
	            ppo2_vf_coef_halving: Multiply the value loss by 0.5.
	            max_grad_norm: Maximum gradient norm passed to ``clip_grad_norm_``.
	            sde_sample_freq: When positive, ``policy.reset_noise()`` is called
	                every this many rollout steps.
	            update_advantage_between_epochs: Recompute advantages at the start
	                of every epoch instead of only the first one.
	            update_returns_between_epochs: Recompute returns at the start of
	                every epoch instead of only the first one.
	            gamma_end: If not ``None``, gamma follows
	                ``linear_schedule(gamma, gamma_end)``; otherwise it is constant.

	        Raises:
	            AssertionError: If ``normalize_advantage`` is set and a minibatch
	                could contain fewer than two samples.
	        """
	        super().__init__(policy, env, device, tb_writer)
	        self.policy = policy
	        self.get_action_mask = getattr(env, "get_action_mask", None)

	        self.gamma_schedule = (
	            linear_schedule(gamma, gamma_end)
	            if gamma_end is not None
	            else constant_schedule(gamma)
	        )
	        self.gae_lambda = gae_lambda
	        self.optimizer = Adam(self.policy.parameters(), lr=learning_rate, eps=1e-7)
	        self.lr_schedule = schedule(learning_rate_decay, learning_rate)
	        self.max_grad_norm = max_grad_norm
	        self.clip_range_schedule = schedule(clip_range_decay, clip_range)
	        self.clip_range_vf_schedule = None
	        if clip_range_vf:
	            self.clip_range_vf_schedule = schedule(clip_range_vf_decay, clip_range_vf)

	        if normalize_advantage:
	            rollout_size = env.num_envs * n_steps
	            assert (
	                rollout_size > 1 and batch_size > 1
	            ), "Each minibatch must be larger than 1 to support normalization"
	            # A trailing minibatch of exactly one sample has an undefined
	            # standard deviation (NaN), which would turn the loss and then
	            # every parameter into NaN. Reject such configurations up front.
	            assert rollout_size % batch_size != 1, (
	                "Rollout size leaves a final minibatch of a single sample, "
	                "which cannot be normalized"
	            )
	        self.normalize_advantage = normalize_advantage

	        self.ent_coef_schedule = schedule(ent_coef_decay, ent_coef)
	        self.vf_coef = vf_coef
	        self.ppo2_vf_coef_halving = ppo2_vf_coef_halving

	        self.n_steps = n_steps
	        self.batch_size = batch_size
	        self.n_epochs = n_epochs
	        self.sde_sample_freq = sde_sample_freq

	        self.update_advantage_between_epochs = update_advantage_between_epochs
	        self.update_returns_between_epochs = update_returns_between_epochs

	    def learn(
	        self: PPOSelf,
	        train_timesteps: int,
	        callbacks: Optional[List[Callback]] = None,
	        total_timesteps: Optional[int] = None,
	        start_timesteps: int = 0,
	    ) -> PPOSelf:
	        """Alternate rollout collection and policy updates.

	        Args:
	            train_timesteps: Number of environment steps to train for in this
	                call. Training proceeds in whole rollouts, so the loop ends
	                after the first rollout that reaches or passes this budget.
	            callbacks: Optional callbacks. After every iteration each
	                ``on_step`` is called with the number of steps in the rollout;
	                a falsy return value stops training.
	            total_timesteps: Horizon used to compute schedule progress.
	                Defaults to ``train_timesteps``.
	            start_timesteps: Timestep count training starts from; used for
	                schedule progress and logging steps.

	        Returns:
	            ``self``.

	        Raises:
	            AssertionError: If ``start_timesteps + train_timesteps`` exceeds
	                ``total_timesteps``.
	        """
	        if total_timesteps is None:
	            total_timesteps = train_timesteps
	        assert start_timesteps + train_timesteps <= total_timesteps

	        epoch_dim = (self.n_steps, self.env.num_envs)
	        step_dim = (self.env.num_envs,)
	        obs_space = single_observation_space(self.env)
	        act_space = single_action_space(self.env)
	        act_shape = self.policy.action_shape

	        next_obs = self.env.reset()
	        next_action_masks = self.get_action_mask() if self.get_action_mask else None
	        # The environments were just reset, so every one starts an episode.
	        next_episode_starts = np.full(step_dim, True, dtype=np.bool_)

	        # Rollout buffers, allocated once and overwritten every iteration.
	        obs = np.zeros(epoch_dim + obs_space.shape, dtype=obs_space.dtype)  # type: ignore
	        actions = np.zeros(epoch_dim + act_shape, dtype=act_space.dtype)  # type: ignore
	        rewards = np.zeros(epoch_dim, dtype=np.float32)
	        episode_starts = np.zeros(epoch_dim, dtype=np.bool_)
	        values = np.zeros(epoch_dim, dtype=np.float32)
	        logprobs = np.zeros(epoch_dim, dtype=np.float32)
	        action_masks = (
	            np.zeros(
	                (self.n_steps,) + next_action_masks.shape, dtype=next_action_masks.dtype
	            )
	            if next_action_masks is not None
	            else None
	        )

	        timesteps_elapsed = start_timesteps
	        while timesteps_elapsed < start_timesteps + train_timesteps:
	            start_time = perf_counter()

	            # Evaluate every schedule at the progress reached before this rollout.
	            progress = timesteps_elapsed / total_timesteps
	            ent_coef = self.ent_coef_schedule(progress)
	            learning_rate = self.lr_schedule(progress)
	            update_learning_rate(self.optimizer, learning_rate)
	            pi_clip = self.clip_range_schedule(progress)
	            gamma = self.gamma_schedule(progress)
	            chart_scalars = {
	                "learning_rate": self.optimizer.param_groups[0]["lr"],
	                "ent_coef": ent_coef,
	                "pi_clip": pi_clip,
	                "gamma": gamma,
	            }
	            if self.clip_range_vf_schedule:
	                v_clip = self.clip_range_vf_schedule(progress)
	                chart_scalars["v_clip"] = v_clip
	            else:
	                v_clip = None
	            log_scalars(self.tb_writer, "charts", chart_scalars, timesteps_elapsed)

	            # ---- Rollout collection ----
	            self.policy.eval()
	            self.policy.reset_noise()
	            for s in range(self.n_steps):
	                timesteps_elapsed += self.env.num_envs
	                if self.sde_sample_freq > 0 and s > 0 and s % self.sde_sample_freq == 0:
	                    self.policy.reset_noise()

	                obs[s] = next_obs
	                episode_starts[s] = next_episode_starts
	                if action_masks is not None:
	                    action_masks[s] = next_action_masks

	                # The stored action is what the policy is later evaluated on;
	                # the separately returned clamped action is what the
	                # environment receives.
	                (
	                    actions[s],
	                    values[s],
	                    logprobs[s],
	                    clamped_action,
	                ) = self.policy.step(next_obs, action_masks=next_action_masks)
	                next_obs, rewards[s], next_episode_starts, _ = self.env.step(
	                    clamped_action
	                )
	                next_action_masks = (
	                    self.get_action_mask() if self.get_action_mask else None
	                )

	            # ---- Optimization ----
	            self.policy.train()

	            # Flatten the (step, env) axes into a single batch axis.
	            b_obs = torch.tensor(obs.reshape((-1,) + obs_space.shape)).to(self.device)  # type: ignore
	            b_actions = torch.tensor(actions.reshape((-1,) + act_shape)).to(  # type: ignore
	                self.device
	            )
	            b_logprobs = torch.tensor(logprobs.reshape(-1)).to(self.device)
	            b_action_masks = (
	                torch.tensor(action_masks.reshape((-1,) + next_action_masks.shape[1:])).to(  # type: ignore
	                    self.device
	                )
	                if action_masks is not None
	                else None
	            )

	            y_pred = values.reshape(-1)
	            b_values = torch.tensor(y_pred).to(self.device)

	            step_stats: List[TrainStepStats] = []
	            # Define variables that will definitely be set through the first epoch
	            advantages: np.ndarray = None  # type: ignore
	            b_advantages: torch.Tensor = None  # type: ignore
	            y_true: np.ndarray = None  # type: ignore
	            b_returns: torch.Tensor = None  # type: ignore
	            for e in range(self.n_epochs):
	                if e == 0 or self.update_advantage_between_epochs:
	                    # The current policy is passed in, so a recomputation in a
	                    # later epoch can differ from the first one.
	                    # NOTE(review): presumably the policy is used to bootstrap
	                    # the value of next_obs - confirm in compute_advantages.
	                    advantages = compute_advantages(
	                        rewards,
	                        values,
	                        episode_starts,
	                        next_episode_starts,
	                        next_obs,
	                        self.policy,
	                        gamma,
	                        self.gae_lambda,
	                    )
	                    b_advantages = torch.tensor(advantages.reshape(-1)).to(self.device)
	                if e == 0 or self.update_returns_between_epochs:
	                    returns = advantages + values
	                    y_true = returns.reshape(-1)
	                    b_returns = torch.tensor(y_true).to(self.device)

	                b_idxs = torch.randperm(len(b_obs))
	                # Only record last epoch's stats
	                step_stats.clear()
	                for i in range(0, len(b_obs), self.batch_size):
	                    self.policy.reset_noise(self.batch_size)

	                    mb_idxs = b_idxs[i : i + self.batch_size]

	                    mb_obs = b_obs[mb_idxs]
	                    mb_actions = b_actions[mb_idxs]
	                    mb_values = b_values[mb_idxs]
	                    mb_logprobs = b_logprobs[mb_idxs]
	                    mb_action_masks = (
	                        b_action_masks[mb_idxs] if b_action_masks is not None else None
	                    )

	                    mb_adv = b_advantages[mb_idxs]
	                    if self.normalize_advantage:
	                        mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)
	                    mb_returns = b_returns[mb_idxs]

	                    new_logprobs, entropy, new_values = self.policy(
	                        mb_obs, mb_actions, action_masks=mb_action_masks
	                    )

	                    # Clipped surrogate objective: take the worse (larger) of
	                    # the unclipped and clipped policy losses per sample.
	                    logratio = new_logprobs - mb_logprobs
	                    ratio = torch.exp(logratio)
	                    clipped_ratio = torch.clamp(ratio, min=1 - pi_clip, max=1 + pi_clip)
	                    pi_loss = torch.max(-ratio * mb_adv, -clipped_ratio * mb_adv).mean()

	                    v_loss_unclipped = (new_values - mb_returns) ** 2
	                    if v_clip:
	                        # Limit how far the new value may move from the rollout
	                        # value, then keep the larger of the two errors.
	                        v_loss_clipped = (
	                            mb_values
	                            + torch.clamp(new_values - mb_values, -v_clip, v_clip)
	                            - mb_returns
	                        ) ** 2
	                        v_loss = torch.max(v_loss_unclipped, v_loss_clipped).mean()
	                    else:
	                        v_loss = v_loss_unclipped.mean()

	                    if self.ppo2_vf_coef_halving:
	                        v_loss *= 0.5

	                    entropy_loss = -entropy.mean()

	                    loss = pi_loss + ent_coef * entropy_loss + self.vf_coef * v_loss

	                    self.optimizer.zero_grad()
	                    loss.backward()
	                    nn.utils.clip_grad_norm_(
	                        self.policy.parameters(), self.max_grad_norm
	                    )
	                    self.optimizer.step()

	                    # Diagnostics only; they reuse the pre-update forward pass.
	                    with torch.no_grad():
	                        approx_kl = ((ratio - 1) - logratio).mean().item()
	                        clipped_frac = (
	                            ((ratio - 1).abs() > pi_clip).float().mean().item()
	                        )
	                        val_clipped_frac = (
	                            ((new_values - mb_values).abs() > v_clip)
	                            .float()
	                            .mean()
	                            .item()
	                            if v_clip
	                            else 0.0
	                        )

	                    step_stats.append(
	                        TrainStepStats(
	                            loss.item(),
	                            pi_loss.item(),
	                            v_loss.item(),
	                            entropy_loss.item(),
	                            approx_kl,
	                            clipped_frac,
	                            val_clipped_frac,
	                        )
	                    )

	            # Explained variance of the rollout value predictions with respect
	            # to the returns; undefined (NaN) when the returns are constant.
	            var_y = np.var(y_true).item()
	            explained_var = (
	                np.nan if var_y == 0 else 1 - np.var(y_true - y_pred).item() / var_y
	            )
	            TrainStats(step_stats, explained_var).write_to_tensorboard(
	                self.tb_writer, timesteps_elapsed
	            )

	            end_time = perf_counter()
	            rollout_steps = self.n_steps * self.env.num_envs
	            self.tb_writer.add_scalar(
	                "train/steps_per_second",
	                rollout_steps / (end_time - start_time),
	                timesteps_elapsed,
	            )

	            if callbacks:
	                # all() short-circuits: once a callback returns a falsy value
	                # the remaining callbacks are not invoked for this iteration.
	                if not all(
	                    c.on_step(timesteps_elapsed=rollout_steps) for c in callbacks
	                ):
	                    logging.info(
	                        f"Callback terminated training at {timesteps_elapsed} timesteps"
	                    )
	                    break

	        return self