yuulind committed on
Commit
a664a45
1 Parent(s): 53ce89b

Initial commit

Files changed (5)
  1. layers.py +98 -0
  2. networks.py +226 -0
  3. pix2pix.py +212 -0
  4. pix2pix_disc_ckpt_200.pt +3 -0
  5. pix2pix_gen_ckpt_200.pt +3 -0
layers.py ADDED
@@ -0,0 +1,98 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ 
+ class DownsamplingBlock(nn.Module):
+     """Defines the Unet downsampling block.
+ 
+     Consists of a Convolution-BatchNorm-LeakyReLU block with k filters.
+     """
+     def __init__(self, c_in, c_out, kernel_size=4, stride=2,
+                  padding=1, negative_slope=0.2, use_norm=True):
+         """
+         Initializes the DownsamplingBlock.
+ 
+         Args:
+             c_in (int): The number of input channels.
+             c_out (int): The number of output channels.
+             kernel_size (int, optional): The size of the convolving kernel. Default is 4.
+             stride (int, optional): Stride of the convolution. Default is 2.
+             padding (int, optional): Zero-padding added to both sides of the input. Default is 1.
+             negative_slope (float, optional): Negative slope for the LeakyReLU activation. Default is 0.2.
+             use_norm (bool, optional): Whether to use a normalization layer. If True, a BatchNorm
+                 layer is added after the convolution. Default is True.
+         """
+         super(DownsamplingBlock, self).__init__()
+         block = []
+         block += [nn.Conv2d(in_channels=c_in, out_channels=c_out,
+                             kernel_size=kernel_size, stride=stride, padding=padding,
+                             bias=(not use_norm)  # No need for a bias if a batchnorm layer follows the conv
+                             )]
+         if use_norm:
+             block += [nn.BatchNorm2d(num_features=c_out)]
+ 
+         block += [nn.LeakyReLU(negative_slope=negative_slope)]
+ 
+         self.conv_block = nn.Sequential(*block)
+ 
+     def forward(self, x):
+         return self.conv_block(x)
+ 
+ 
+ class UpsamplingBlock(nn.Module):
+     """Defines the Unet upsampling block.
+ 
+     Consists of a Convolution-BatchNorm-ReLU block with k filters.
+     """
+     def __init__(self, c_in, c_out, kernel_size=4, stride=2,
+                  padding=1, use_dropout=False, use_upsampling=False, mode='nearest'):
+         """
+         Initializes the UpsamplingBlock.
+ 
+         Args:
+             c_in (int): The number of input channels.
+             c_out (int): The number of output channels.
+             kernel_size (int, optional): Size of the convolving kernel. Default is 4.
+             stride (int, optional): Stride of the convolution. Default is 2.
+             padding (int, optional): Zero-padding added to both sides of the input. Default is 1.
+             use_dropout (bool, optional): Whether to use a dropout layer. Default is False.
+             use_upsampling (bool, optional): Whether to use upsampling followed by a regular
+                 convolution rather than a transposed convolution. Default is False.
+             mode (str, optional): The upsampling algorithm: one of 'nearest',
+                 'bilinear', 'bicubic'. Default is 'nearest'.
+         """
+         super(UpsamplingBlock, self).__init__()
+         block = []
+         if use_upsampling:
+             # Transposed convolution can cause checkerboard artifacts; upsampling
+             # followed by a regular convolution apparently produces better results.
+             # For further reading: https://distill.pub/2016/deconv-checkerboard/
+             # Odena, et al., "Deconvolution and Checkerboard Artifacts", Distill, 2016. http://doi.org/10.23915/distill.00003
+             mode = mode if mode in ('nearest', 'bilinear', 'bicubic') else 'nearest'
+ 
+             block += [nn.Sequential(
+                 nn.Upsample(scale_factor=2, mode=mode),
+                 nn.Conv2d(in_channels=c_in, out_channels=c_out,
+                           kernel_size=3, stride=1, padding=padding,
+                           bias=False
+                           )
+             )]
+         else:
+             block += [nn.ConvTranspose2d(in_channels=c_in,
+                                          out_channels=c_out,
+                                          kernel_size=kernel_size,
+                                          stride=stride,
+                                          padding=padding, bias=False
+                                          )
+                       ]
+ 
+         block += [nn.BatchNorm2d(num_features=c_out)]
+ 
+         if use_dropout:
+             block += [nn.Dropout(0.5)]
+ 
+         block += [nn.ReLU()]
+ 
+         self.conv_block = nn.Sequential(*block)
+ 
+     def forward(self, x):
+         return self.conv_block(x)
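Not part of the commit: a minimal sketch of how the two blocks behave, assuming layers.py is importable from the working directory and a 256x256 input.

```python
# Sketch only: exercise DownsamplingBlock and UpsamplingBlock from layers.py.
import torch
from layers import DownsamplingBlock, UpsamplingBlock  # assumes layers.py is on the import path

x = torch.randn(1, 3, 256, 256)
down = DownsamplingBlock(3, 64, use_norm=False)    # stride-2 conv halves the spatial size
up = UpsamplingBlock(64, 3, use_upsampling=True)   # nearest upsample + 3x3 conv doubles it back

h = down(x)
print(h.shape)       # torch.Size([1, 64, 128, 128])
print(up(h).shape)   # torch.Size([1, 3, 256, 256])
```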
networks.py ADDED
@@ -0,0 +1,226 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ from .layers import DownsamplingBlock, UpsamplingBlock
+ 
+ 
+ class UnetEncoder(nn.Module):
+     """Creates the Unet Encoder Network.
+ 
+     C64-C128-C256-C512-C512-C512-C512-C512
+     """
+     def __init__(self, c_in=3, c_out=512):
+         """
+         Constructs the Unet Encoder Network.
+ 
+         Ck denotes a Convolution-BatchNorm-ReLU layer with k filters.
+         C64-C128-C256-C512-C512-C512-C512-C512
+ 
+         Args:
+             c_in (int, optional): Number of input channels. Default is 3.
+             c_out (int, optional): Number of output channels. Default is 512.
+         """
+         super(UnetEncoder, self).__init__()
+         self.enc1 = DownsamplingBlock(c_in, 64, use_norm=False)  # C64
+         self.enc2 = DownsamplingBlock(64, 128)                   # C128
+         self.enc3 = DownsamplingBlock(128, 256)                  # C256
+         self.enc4 = DownsamplingBlock(256, 512)                  # C512
+         self.enc5 = DownsamplingBlock(512, 512)                  # C512
+         self.enc6 = DownsamplingBlock(512, 512)                  # C512
+         self.enc7 = DownsamplingBlock(512, 512)                  # C512
+         self.enc8 = DownsamplingBlock(512, c_out)                # C512
+ 
+     def forward(self, x):
+         x1 = self.enc1(x)
+         x2 = self.enc2(x1)
+         x3 = self.enc3(x2)
+         x4 = self.enc4(x3)
+         x5 = self.enc5(x4)
+         x6 = self.enc6(x5)
+         x7 = self.enc7(x6)
+         x8 = self.enc8(x7)
+         out = [x8, x7, x6, x5, x4, x3, x2, x1]  # the latest activation is the first element
+         return out
+ 
+ 
+ class UnetDecoder(nn.Module):
+     """Creates the Unet Decoder Network."""
+     def __init__(self, c_in=512, c_out=64, use_upsampling=False, mode='nearest'):
+         """
+         Constructs the Unet Decoder Network.
+ 
+         Ck denotes a Convolution-BatchNorm-ReLU layer with k filters.
+         CDk denotes a Convolution-BatchNorm-Dropout-ReLU layer with a dropout rate of 50%.
+         CD512-CD1024-CD1024-C1024-C1024-C512-C256-C128
+ 
+         Args:
+             c_in (int, optional): Number of input channels. Default is 512.
+             c_out (int, optional): Number of output channels. Default is 64.
+             use_upsampling (bool, optional): Upsampling method for the decoder.
+                 If True, use an upsampling layer followed by a regular convolution layer.
+                 If False, use a transposed convolution. Default is False.
+             mode (str, optional): The upsampling algorithm: one of 'nearest',
+                 'bilinear', 'bicubic'. Default is 'nearest'.
+         """
+         super(UnetDecoder, self).__init__()
+         self.dec1 = UpsamplingBlock(c_in, 512, use_dropout=True, use_upsampling=use_upsampling, mode=mode)  # CD512
+         self.dec2 = UpsamplingBlock(1024, 512, use_dropout=True, use_upsampling=use_upsampling, mode=mode)  # CD1024
+         self.dec3 = UpsamplingBlock(1024, 512, use_dropout=True, use_upsampling=use_upsampling, mode=mode)  # CD1024
+         self.dec4 = UpsamplingBlock(1024, 512, use_upsampling=use_upsampling, mode=mode)   # C1024
+         self.dec5 = UpsamplingBlock(1024, 256, use_upsampling=use_upsampling, mode=mode)   # C1024
+         self.dec6 = UpsamplingBlock(512, 128, use_upsampling=use_upsampling, mode=mode)    # C512
+         self.dec7 = UpsamplingBlock(256, 64, use_upsampling=use_upsampling, mode=mode)     # C256
+         self.dec8 = UpsamplingBlock(128, c_out, use_upsampling=use_upsampling, mode=mode)  # C128
+ 
+     def forward(self, x):
+         x9 = torch.cat([x[1], self.dec1(x[0])], 1)   # (N, 1024, H, W)
+         x10 = torch.cat([x[2], self.dec2(x9)], 1)    # (N, 1024, H, W)
+         x11 = torch.cat([x[3], self.dec3(x10)], 1)   # (N, 1024, H, W)
+         x12 = torch.cat([x[4], self.dec4(x11)], 1)   # (N, 1024, H, W)
+         x13 = torch.cat([x[5], self.dec5(x12)], 1)   # (N, 512, H, W)
+         x14 = torch.cat([x[6], self.dec6(x13)], 1)   # (N, 256, H, W)
+         x15 = torch.cat([x[7], self.dec7(x14)], 1)   # (N, 128, H, W)
+         out = self.dec8(x15)                         # (N, 64, H, W)
+         return out
+ 
+ 
+ class UnetGenerator(nn.Module):
+     """Creates a Unet-based generator."""
+     def __init__(self, c_in=3, c_out=3, use_upsampling=False, mode='nearest'):
+         """
+         Constructs a Unet generator.
+ 
+         Args:
+             c_in (int): The number of input channels.
+             c_out (int): The number of output channels.
+             use_upsampling (bool, optional): Upsampling method for the decoder.
+                 If True, use an upsampling layer followed by a regular convolution layer.
+                 If False, use a transposed convolution. Default is False.
+             mode (str, optional): The upsampling algorithm: one of 'nearest',
+                 'bilinear', 'bicubic'. Default is 'nearest'.
+         """
+         super(UnetGenerator, self).__init__()
+         self.encoder = UnetEncoder(c_in=c_in)
+         self.decoder = UnetDecoder(use_upsampling=use_upsampling, mode=mode)
+         # In the paper, the authors state:
+         #   "After the last layer in the decoder, a convolution is applied
+         #    to map to the number of output channels (3 in general, except
+         #    in colorization, where it is 2), followed by a Tanh function."
+         # However, in the official Lua implementation, only a Tanh layer is applied.
+         # Therefore, I took the liberty of adding a convolutional layer with a
+         # kernel size of 3.
+         # For more information please check the paper and the official GitHub repo:
+         # https://github.com/phillipi/pix2pix
+         # https://arxiv.org/abs/1611.07004
+         self.head = nn.Sequential(
+             nn.Conv2d(in_channels=64, out_channels=c_out,
+                       kernel_size=3, stride=1, padding=1,
+                       bias=True
+                       ),
+             nn.Tanh()
+         )
+ 
+     def forward(self, x):
+         outE = self.encoder(x)
+         outD = self.decoder(outE)
+         out = self.head(outD)
+         return out
+ 
+ 
+ class PatchDiscriminator(nn.Module):
+     """Creates a PatchGAN discriminator."""
+     def __init__(self, c_in=3, c_hid=64, n_layers=3):
+         """Constructs a PatchGAN discriminator.
+ 
+         Args:
+             c_in (int, optional): The number of input channels. Defaults to 3.
+             c_hid (int, optional): The number of channels after the first conv layer.
+                 Defaults to 64.
+             n_layers (int, optional): The number of convolution blocks in the
+                 discriminator. Defaults to 3.
+         """
+         super(PatchDiscriminator, self).__init__()
+         model = [DownsamplingBlock(c_in, c_hid, use_norm=False)]
+ 
+         n_p = 1  # channel multiplier for the previous block
+         n_c = 1  # channel multiplier for the current block
+         # The last block has a stride of 1, therefore iterate (n_layers-1) times
+         for n in range(1, n_layers):
+             n_p = n_c
+             n_c = min(2**n, 8)  # the number of channels is 512 at most
+             model += [DownsamplingBlock(c_hid*n_p, c_hid*n_c)]
+ 
+         n_p = n_c
+         n_c = min(2**n_layers, 8)
+         model += [DownsamplingBlock(c_hid*n_p, c_hid*n_c, stride=1)]  # last block has a stride of 1
+ 
+         # The final layer is a convolution that maps to a 1-channel prediction map.
+         model += [nn.Conv2d(in_channels=c_hid*n_c, out_channels=1,
+                             kernel_size=4, stride=1, padding=1, bias=True
+                             )]
+         # Normally, there would be a sigmoid layer at the end of the discriminator.
+         # However, nn.BCEWithLogitsLoss combines the sigmoid layer with the BCE loss,
+         # providing greater numerical stability. Therefore, the discriminator outputs
+         # logits to take advantage of this stability.
+         self.model = nn.Sequential(*model)
+ 
+     def forward(self, x):
+         return self.model(x)
+ 
+ 
+ class PixelDiscriminator(nn.Module):
+     """Creates a PixelGAN discriminator (1x1 PatchGAN discriminator)."""
+     def __init__(self, c_in=3, c_hid=64):
+         """Constructs a PixelGAN discriminator, a special form of the PatchGAN discriminator.
+         All convolutions are 1x1 spatial filters.
+ 
+         Args:
+             c_in (int, optional): The number of input channels. Defaults to 3.
+             c_hid (int, optional): The number of channels after the first conv layer.
+                 Defaults to 64.
+         """
+         super(PixelDiscriminator, self).__init__()
+         self.model = nn.Sequential(
+             DownsamplingBlock(c_in, c_hid, kernel_size=1, stride=1, padding=0, use_norm=False),
+             DownsamplingBlock(c_hid, c_hid*2, kernel_size=1, stride=1, padding=0),
+             nn.Conv2d(in_channels=c_hid*2, out_channels=1, kernel_size=1)
+         )
+         # As with PatchDiscriminator, there would normally be a sigmoid layer at the end.
+         # However, nn.BCEWithLogitsLoss combines the sigmoid layer with the BCE loss,
+         # providing greater numerical stability. Therefore, the discriminator outputs
+         # logits to take advantage of this stability.
+ 
+     def forward(self, x):
+         return self.model(x)
+ 
+ 
+ class PatchGAN(nn.Module):
+     """Creates a PatchGAN discriminator."""
+     def __init__(self, c_in=3, c_hid=64, mode='patch', n_layers=3):
+         """Constructs a PatchGAN discriminator.
+ 
+         Args:
+             c_in (int, optional): The number of input channels. Defaults to 3.
+             c_hid (int, optional): The number of channels after the first
+                 convolutional layer. Defaults to 64.
+             mode (str, optional): PatchGAN type. Use 'pixel' for PixelGAN, and
+                 'patch' for other types. Defaults to 'patch'.
+             n_layers (int, optional): The number of PatchGAN layers. Defaults to 3.
+                 - 16x16 PatchGAN if n=1
+                 - 34x34 PatchGAN if n=2
+                 - 70x70 PatchGAN if n=3
+                 - 142x142 PatchGAN if n=4
+                 - 286x286 PatchGAN if n=5
+                 - 574x574 PatchGAN if n=6
+         """
+         super(PatchGAN, self).__init__()
+         if mode == 'pixel':
+             self.model = PixelDiscriminator(c_in, c_hid)
+         else:
+             self.model = PatchDiscriminator(c_in, c_hid, n_layers)
+ 
+     def forward(self, x):
+         return self.model(x)
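Not part of the commit: a shape-check sketch for the generator and the 70x70 PatchGAN. It assumes the modules can be imported flat (the relative imports in networks.py may require packaging the files or adjusting the import) and a 256x256 input, which the 8-level encoder expects.

```python
# Sketch only: instantiate the networks and verify output shapes.
import torch
from networks import UnetGenerator, PatchGAN  # assumption: flat imports work in your setup

gen = UnetGenerator(c_in=3, c_out=3)
disc = PatchGAN(c_in=6, c_hid=64, mode='patch', n_layers=3)  # 6 = input + output channels (conditional setup)

x = torch.randn(2, 3, 256, 256)   # batch > 1 keeps the 1x1 bottleneck BatchNorm happy in train mode
y_fake = gen(x)
print(y_fake.shape)               # torch.Size([2, 3, 256, 256])

logits = disc(torch.cat([x, y_fake], dim=1))
print(logits.shape)               # torch.Size([2, 1, 30, 30]) -> one logit per 70x70 receptive field
```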
pix2pix.py ADDED
@@ -0,0 +1,212 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ from .networks import UnetGenerator, PatchGAN
+ 
+ 
+ class Pix2Pix(nn.Module):
+     """Creates the Pix2Pix model for image-to-image translation tasks.
+ 
+     By default, the generator is a Unet that upsamples with transposed
+     convolutions, and the discriminator is a 70x70 PatchGAN.
+     """
+     def __init__(self,
+                  c_in: int = 3,
+                  c_out: int = 3,
+                  is_train: bool = True,
+                  netD: str = 'patch',
+                  lambda_L1: float = 100.0,
+                  is_CGAN: bool = True,
+                  use_upsampling: bool = False,
+                  mode: str = 'nearest',
+                  c_hid: int = 64,
+                  n_layers: int = 3,
+                  lr: float = 0.0002,
+                  beta1: float = 0.5,
+                  beta2: float = 0.999
+                  ):
+         """Constructs the Pix2Pix class.
+ 
+         Args:
+             c_in: Number of input channels
+             c_out: Number of output channels
+             is_train: Whether the model is in training mode
+             netD: Type of discriminator ('patch' or 'pixel')
+             lambda_L1: Weight for the L1 loss
+             is_CGAN: If True, use a conditional GAN architecture
+             use_upsampling: If True, use upsampling in the generator instead of transposed convolutions
+             mode: Upsampling mode ('nearest', 'bilinear', 'bicubic')
+             c_hid: Number of base filters in the discriminator
+             n_layers: Number of layers in the discriminator
+             lr: Learning rate
+             beta1: Beta1 parameter for the Adam optimizer
+             beta2: Beta2 parameter for the Adam optimizer
+         """
+         super(Pix2Pix, self).__init__()
+         self.is_CGAN = is_CGAN
+         self.lambda_L1 = lambda_L1
+ 
+         self.gen = UnetGenerator(c_in=c_in, c_out=c_out, use_upsampling=use_upsampling, mode=mode)
+         self.gen = self.gen.apply(self.weights_init)
+ 
+         if is_train:
+             # Conditional GANs see the input and output together, so the total
+             # number of input channels for the discriminator is c_in + c_out.
+             disc_in = c_in + c_out if is_CGAN else c_out
+             self.disc = PatchGAN(c_in=disc_in, c_hid=c_hid, mode=netD, n_layers=n_layers)
+             self.disc = self.disc.apply(self.weights_init)
+ 
+             # Initialize optimizers
+             self.gen_optimizer = torch.optim.Adam(
+                 self.gen.parameters(), lr=lr, betas=(beta1, beta2))
+             self.disc_optimizer = torch.optim.Adam(
+                 self.disc.parameters(), lr=lr, betas=(beta1, beta2))
+ 
+             # Initialize loss functions
+             self.criterion = nn.BCEWithLogitsLoss()
+             self.criterion_L1 = nn.L1Loss()
+ 
+     def forward(self, x):
+         return self.gen(x)
+ 
+     @staticmethod
+     def weights_init(m):
+         """Initializes network weights.
+ 
+         Args:
+             m: network module
+         """
+         if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
+             nn.init.normal_(m.weight, 0.0, 0.02)
+             if hasattr(m, 'bias') and m.bias is not None:
+                 nn.init.constant_(m.bias, 0.0)
+         if isinstance(m, nn.BatchNorm2d):
+             nn.init.normal_(m.weight, 1.0, 0.02)
+             nn.init.constant_(m.bias, 0)
+ 
+     def _get_disc_inputs(self, real_images, target_images, fake_images):
+         """Prepares the discriminator inputs based on the conditional/unconditional setup."""
+         if self.is_CGAN:
+             # Conditional GANs see the input and output together, so the total
+             # number of input channels is c_in + c_out.
+             real_AB = torch.cat([real_images, target_images], dim=1)
+             fake_AB = torch.cat([real_images,
+                                  fake_images.detach()],
+                                 dim=1)
+         else:
+             real_AB = target_images
+             fake_AB = fake_images.detach()
+         return real_AB, fake_AB
+ 
+     def _get_gen_inputs(self, real_images, fake_images):
+         """Prepares the discriminator input for the generator update based on the
+         conditional/unconditional setup."""
+         if self.is_CGAN:
+             # Conditional GANs see the input and output together, so the total
+             # number of input channels is c_in + c_out.
+             fake_AB = torch.cat([real_images,
+                                  fake_images],
+                                 dim=1)
+         else:
+             fake_AB = fake_images
+         return fake_AB
+ 
+     def step_discriminator(self, real_images, target_images, fake_images):
+         """Computes the discriminator loss.
+ 
+         Args:
+             real_images: Input images
+             target_images: Ground truth images
+             fake_images: Generated images
+ 
+         Returns:
+             Discriminator loss value
+         """
+         # Prepare inputs
+         real_AB, fake_AB = self._get_disc_inputs(real_images, target_images,
+                                                  fake_images)
+ 
+         # Forward pass through the discriminator
+         pred_real = self.disc(real_AB)  # D(x, y)
+         pred_fake = self.disc(fake_AB)  # D(x, G(x))
+ 
+         # Compute the losses
+         lossD_real = self.criterion(pred_real, torch.ones_like(pred_real))   # (D(x, y), 1)
+         lossD_fake = self.criterion(pred_fake, torch.zeros_like(pred_fake))  # (D(x, G(x)), 0)
+         lossD = (lossD_real + lossD_fake) * 0.5  # Combined loss
+         return lossD
+ 
+     def step_generator(self, real_images, target_images, fake_images):
+         """Computes the generator loss.
+ 
+         Args:
+             real_images: Input images
+             target_images: Ground truth images
+             fake_images: Generated images
+ 
+         Returns:
+             Total generator loss and a dictionary of its individual components
+         """
+         # Prepare input
+         fake_AB = self._get_gen_inputs(real_images, fake_images)
+ 
+         # Forward pass through the discriminator
+         pred_fake = self.disc(fake_AB)
+ 
+         # Compute the losses
+         lossG_GAN = self.criterion(pred_fake, torch.ones_like(pred_fake))  # GAN loss
+         lossG_L1 = self.criterion_L1(fake_images, target_images)           # L1 loss
+         lossG = lossG_GAN + self.lambda_L1 * lossG_L1                      # Combined loss
+         # Return the total loss and its individual components
+         return lossG, {
+             'loss_G': lossG.item(),
+             'loss_G_GAN': lossG_GAN.item(),
+             'loss_G_L1': lossG_L1.item()
+         }
+ 
+     def train_step(self, real_images, target_images):
+         """Performs a single training step.
+ 
+         Args:
+             real_images: Input images
+             target_images: Ground truth images
+ 
+         Returns:
+             Dictionary containing all loss values from this step
+         """
+         # Forward pass through the generator
+         fake_images = self.forward(real_images)
+ 
+         # Update the discriminator
+         self.disc_optimizer.zero_grad()  # Reset the gradients for D
+         lossD = self.step_discriminator(real_images, target_images, fake_images)  # Compute the loss
+         lossD.backward()
+         self.disc_optimizer.step()  # Update D
+ 
+         # Update the generator
+         self.gen_optimizer.zero_grad()  # Reset the gradients for G
+         lossG, G_losses = self.step_generator(real_images, target_images, fake_images)  # Compute the loss
+         lossG.backward()
+         self.gen_optimizer.step()  # Update G
+ 
+         # Return all losses
+         return {
+             'loss_D': lossD.item(),
+             **G_losses
+         }
+ 
+     def get_current_visuals(self, real_images, target_images):
+         """Returns visualization images.
+ 
+         Args:
+             real_images: Input images
+             target_images: Ground truth images
+ 
+         Returns:
+             Dictionary containing the input, generated, and target images
+         """
+         with torch.no_grad():
+             fake_images = self.gen(real_images)
+         return {
+             'real': real_images,
+             'fake': fake_images,
+             'target': target_images
+         }
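Not part of the commit: a minimal training-step sketch with random tensors standing in for a paired dataset, under the same flat-import assumption as above.

```python
# Sketch only: one optimization step of the Pix2Pix wrapper.
import torch
from pix2pix import Pix2Pix  # assumption: flat imports work in your setup

model = Pix2Pix(c_in=3, c_out=3, is_train=True)

real = torch.randn(4, 3, 256, 256)     # input-domain images
target = torch.randn(4, 3, 256, 256)   # paired target-domain images

losses = model.train_step(real, target)
print(losses)  # {'loss_D': ..., 'loss_G': ..., 'loss_G_GAN': ..., 'loss_G_L1': ...}
```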
pix2pix_disc_ckpt_200.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:914a7a2152fabd46a7bcc7aad3fb3e642cd0432df7151b4f04c9cf792fdc831b
+ size 11090624
pix2pix_gen_ckpt_200.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db2fda865233203ba13e7c10220bfbf46fa6a92ecb462fb5ad39bf22bbad15af
+ size 218246966
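Not part of the commit: a sketch of loading the epoch-200 checkpoints. It assumes each .pt file stores a plain state_dict for the default architectures above; if the files hold a larger training dictionary instead, index into it before loading.

```python
# Sketch only: restore generator/discriminator weights (the state_dict layout is an assumption).
import torch
from networks import UnetGenerator, PatchGAN

gen = UnetGenerator(c_in=3, c_out=3)
disc = PatchGAN(c_in=6, c_hid=64, mode='patch', n_layers=3)

gen.load_state_dict(torch.load('pix2pix_gen_ckpt_200.pt', map_location='cpu'))
disc.load_state_dict(torch.load('pix2pix_disc_ckpt_200.pt', map_location='cpu'))
gen.eval()
```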