import logging from dataclasses import dataclass from functools import partial from typing import Protocol import matplotlib.pyplot as plt import numpy as np import scipy import torch import torch.nn.functional as F from torch import Tensor, nn from tqdm import trange from .wn import WN logger = logging.getLogger(__name__) class VelocityField(Protocol): def __call__(self, *, t: Tensor, ψt: Tensor, dt: Tensor) -> Tensor: ... class Solver: def __init__( self, method="midpoint", nfe=32, viz_name="solver", viz_every=100, mel_fn=None, time_mapping_divisor=4, verbose=False, ): self.configurate_(nfe=nfe, method=method) self.verbose = verbose self.viz_every = viz_every self.viz_name = viz_name self._camera = None self._mel_fn = mel_fn self._time_mapping = partial(self.exponential_decay_mapping, n=time_mapping_divisor) def configurate_(self, nfe=None, method=None): if nfe is None: nfe = self.nfe if method is None: method = self.method if nfe == 1 and method in ("midpoint", "rk4"): logger.warning(f"1 NFE is not supported for {method}, using euler method instead.") method = "euler" self.nfe = nfe self.method = method @property def time_mapping(self): return self._time_mapping @staticmethod def exponential_decay_mapping(t, n=4): """ Args: n: target step """ def h(t, a): return (a**t - 1) / (a - 1) # Solve h(1/n) = 0.5 a = float(scipy.optimize.fsolve(lambda a: h(1 / n, a) - 0.5, x0=0)) t = h(t, a=a) return t @torch.no_grad() def _maybe_camera_snap(self, *, ψt, t): camera = self._camera if camera is not None: if ψt.shape[1] == 1: # Waveform, b 1 t, plot every 100 samples plt.subplot(211) plt.plot(ψt.detach().cpu().numpy()[0, 0, ::100], color="blue") if self._mel_fn is not None: plt.subplot(212) mel = self._mel_fn(ψt.detach().cpu().numpy()[0, 0]) plt.imshow(mel, origin="lower", interpolation="none") elif ψt.shape[1] == 2: # Complex plt.subplot(121) plt.imshow( ψt.detach().cpu().numpy()[0, 0], origin="lower", interpolation="none", ) plt.subplot(122) plt.imshow( ψt.detach().cpu().numpy()[0, 1], origin="lower", interpolation="none", ) else: # Spectrogram, b c t plt.imshow(ψt.detach().cpu().numpy()[0], origin="lower", interpolation="none") ax = plt.gca() ax.text(0.5, 1.01, f"t={t:.2f}", transform=ax.transAxes, ha="center") camera.snap() @staticmethod def _euler_step(t, ψt, dt, f: VelocityField): return ψt + dt * f(t=t, ψt=ψt, dt=dt) @staticmethod def _midpoint_step(t, ψt, dt, f: VelocityField): return ψt + dt * f(t=t + dt / 2, ψt=ψt + dt * f(t=t, ψt=ψt, dt=dt) / 2, dt=dt) @staticmethod def _rk4_step(t, ψt, dt, f: VelocityField): k1 = f(t=t, ψt=ψt, dt=dt) k2 = f(t=t + dt / 2, ψt=ψt + dt * k1 / 2, dt=dt) k3 = f(t=t + dt / 2, ψt=ψt + dt * k2 / 2, dt=dt) k4 = f(t=t + dt, ψt=ψt + dt * k3, dt=dt) return ψt + dt * (k1 + 2 * k2 + 2 * k3 + k4) / 6 @property def _step(self): if self.method == "euler": return self._euler_step elif self.method == "midpoint": return self._midpoint_step elif self.method == "rk4": return self._rk4_step else: raise ValueError(f"Unknown method: {self.method}") def get_running_train_loop(self): try: # Lazy import from ...utils.train_loop import TrainLoop return TrainLoop.get_running_loop() except ImportError: return None @property def visualizing(self): loop = self.get_running_train_loop() if loop is None: return out_path = loop.make_current_step_viz_path(self.viz_name, ".gif") return loop.global_step % self.viz_every == 0 and not out_path.exists() def _reset_camera(self): try: from celluloid import Camera self._camera = Camera(plt.figure()) except: pass def _maybe_dump_camera(self): camera = self._camera loop = self.get_running_train_loop() if camera is not None and loop is not None: animation = camera.animate() out_path = loop.make_current_step_viz_path(self.viz_name, ".gif") out_path.parent.mkdir(exist_ok=True, parents=True) animation.save(out_path, writer="pillow", fps=4) plt.close() self._camera = None @property def n_steps(self): n = self.nfe if self.method == "euler": pass elif self.method == "midpoint": n //= 2 elif self.method == "rk4": n //= 4 else: raise ValueError(f"Unknown method: {self.method}") return n def solve(self, f: VelocityField, ψ0: Tensor, t0=0.0, t1=1.0): ts = self._time_mapping(np.linspace(t0, t1, self.n_steps + 1)) if self.visualizing: self._reset_camera() if self.verbose: steps = trange(self.n_steps, desc="CFM inference") else: steps = range(self.n_steps) ψt = ψ0 for i in steps: dt = ts[i + 1] - ts[i] t = ts[i] self._maybe_camera_snap(ψt=ψt, t=t) ψt = self._step(t=t, ψt=ψt, dt=dt, f=f) self._maybe_camera_snap(ψt=ψt, t=ts[-1]) ψ1 = ψt del ψt self._maybe_dump_camera() return ψ1 def __call__(self, f: VelocityField, ψ0: Tensor, t0=0.0, t1=1.0): return self.solve(f=f, ψ0=ψ0, t0=t0, t1=t1) class SinusodialTimeEmbedding(nn.Module): def __init__(self, d_embed): super().__init__() self.d_embed = d_embed assert d_embed % 2 == 0 def forward(self, t): t = t.unsqueeze(-1) # ... 1 p = torch.linspace(0, 4, self.d_embed // 2).to(t) while p.dim() < t.dim(): p = p.unsqueeze(0) # ... d/2 sin = torch.sin(t * 10**p) cos = torch.cos(t * 10**p) return torch.cat([sin, cos], dim=-1) @dataclass(eq=False) class CFM(nn.Module): """ This mixin is for general diffusion models. ψ0 stands for the gaussian noise, and ψ1 is the data point. Here we follow the CFM style: The generation process (reverse process) is from t=0 to t=1. The forward process is from t=1 to t=0. """ cond_dim: int output_dim: int time_emb_dim: int = 128 viz_name: str = "cfm" solver_nfe: int = 32 solver_method: str = "midpoint" time_mapping_divisor: int = 4 def __post_init__(self): super().__init__() self.solver = Solver( viz_name=self.viz_name, viz_every=1, nfe=self.solver_nfe, method=self.solver_method, time_mapping_divisor=self.time_mapping_divisor, ) self.emb = SinusodialTimeEmbedding(self.time_emb_dim) self.net = WN( input_dim=self.output_dim, output_dim=self.output_dim, local_dim=self.cond_dim, global_dim=self.time_emb_dim, ) def _perturb(self, ψ1: Tensor, t: Tensor | None = None): """ Perturb ψ1 to ψt. """ raise NotImplementedError def _sample_ψ0(self, x: Tensor): """ Args: x: (b c t), which implies the shape of ψ0 """ shape = list(x.shape) shape[1] = self.output_dim if self.training: g = None else: g = torch.Generator(device=x.device) g.manual_seed(0) # deterministic sampling during eval ψ0 = torch.randn(shape, device=x.device, dtype=x.dtype, generator=g) return ψ0 @property def sigma(self): return 1e-4 def _to_ψt(self, *, ψ1: Tensor, ψ0: Tensor, t: Tensor): """ Eq (22) """ while t.dim() < ψ1.dim(): t = t.unsqueeze(-1) μ = t * ψ1 + (1 - t) * ψ0 return μ + torch.randn_like(μ) * self.sigma def _to_u(self, *, ψ1, ψ0: Tensor): """ Eq (21) """ return ψ1 - ψ0 def _to_v(self, *, ψt, x, t: float | Tensor): """ Args: ψt: (b c t) x: (b c t) t: (b) Returns: v: (b c t) """ if isinstance(t, (float, int)): t = torch.full(ψt.shape[:1], t).to(ψt) t = t.clamp(0, 1) # [0, 1) g = self.emb(t) # (b d) v = self.net(ψt, l=x, g=g) return v def compute_losses(self, x, y, ψ0) -> dict: """ Args: x: (b c t) y: (b c t) Returns: losses: dict """ t = torch.rand(len(x), device=x.device, dtype=x.dtype) t = self.solver.time_mapping(t) if ψ0 is None: ψ0 = self._sample_ψ0(x) ψt = self._to_ψt(ψ1=y, t=t, ψ0=ψ0) v = self._to_v(ψt=ψt, t=t, x=x) u = self._to_u(ψ1=y, ψ0=ψ0) losses = dict(l1=F.l1_loss(v, u)) return losses @torch.inference_mode() def sample(self, x, ψ0=None, t0=0.0): """ Args: x: (b c t) Returns: y: (b ... t) """ if ψ0 is None: ψ0 = self._sample_ψ0(x) f = lambda t, ψt, dt: self._to_v(ψt=ψt, t=t, x=x) ψ1 = self.solver(f=f, ψ0=ψ0, t0=t0) return ψ1 def forward(self, x: Tensor, y: Tensor | None = None, ψ0: Tensor | None = None, t0=0.0): if y is None: y = self.sample(x, ψ0=ψ0, t0=t0) else: self.losses = self.compute_losses(x, y, ψ0=ψ0) return y