Spaces:

crop-classification
/

messis-demo

Sleeping

File size: 45,345 Bytes

5b24075

import torch
import torch.nn as nn
import pytorch_lightning as pl
from torchmetrics import classification
import wandb
from matplotlib import pyplot as plt
import numpy as np
import matplotlib.ticker as ticker
from matplotlib.colors import ListedColormap
from huggingface_hub import PyTorchModelHubMixin
from lion_pytorch import Lion

import json

from messis.prithvi import TemporalViTEncoder, ConvTransformerTokensToEmbeddingNeck, ConvTransformerTokensToEmbeddingBottleneckNeck


def safe_shape(x):
    if isinstance(x, tuple):
        # loop through tuple
        shape_info = '(tuple) : '
        for i in x:
            shape_info += str(i.shape) + ', '
        return shape_info
    if isinstance(x, list):
        # loop through list
        shape_info = '(list) : '
        for i in x:
            shape_info += str(i.shape) + ', '
        return shape_info
    return x.shape

class ConvModule(nn.Module):
    """
    A simple convolutional module including Conv, BatchNorm, and ReLU layers.
    """
    def __init__(self, in_channels, out_channels, kernel_size, padding, dilation):
        super(ConvModule, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding, dilation=dilation, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return self.relu(x)

class HierarchicalFCNHead(nn.Module):
    """
    Hierarchical FCN Head for semantic segmentation.
    """
    def __init__(self, in_channels, out_channels, num_classes, num_convs=2, kernel_size=3, dilation=1, dropout_p=0.1, debug=False):
        super(HierarchicalFCNHead, self).__init__()

        self.debug = debug
        
        self.convs = nn.Sequential(*[
            ConvModule(
                in_channels if i == 0 else out_channels,
                out_channels,
                kernel_size,
                padding=dilation * (kernel_size // 2),
                dilation=dilation
            ) for i in range(num_convs)
        ])
        
        self.conv_seg = nn.Conv2d(out_channels, num_classes, kernel_size=1)
        self.dropout = nn.Dropout2d(p=dropout_p)

    def forward(self, x):
        if self.debug:
            print('HierarchicalFCNHead forward INP: ', safe_shape(x))
        x = self.convs(x)
        features = self.dropout(x)
        output = self.conv_seg(features)
        if self.debug:
            print('HierarchicalFCNHead forward features OUT: ', safe_shape(features))
            print('HierarchicalFCNHead forward output OUT: ', safe_shape(output))
        return output, features

class LabelRefinementHead(nn.Module):
    """
    Similar to the label refinement module introduced in the ZueriCrop paper, this module refines the predictions for tier 3.
    It takes the raw predictions from head 1, head 2 and head 3 and refines them to produce the final prediction for tier 3.
    According to ZueriCrop, this helps with making the predictions more consistent across the different tiers.
    """
    def __init__(self, input_channels, num_classes):
        super(LabelRefinementHead, self).__init__()
        
        self.cnn_layers = nn.Sequential(
            # 1x1 Convolutional layer
            nn.Conv2d(in_channels=input_channels, out_channels=128, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            
            # 3x3 Convolutional layer
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),

            # Skip connection (implemented in forward method)
            
            # Another 3x3 Convolutional layer
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            
            # 1x1 Convolutional layer to adjust the number of output channels to num_classes
            nn.Conv2d(in_channels=128, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
            nn.Dropout(p=0.5)
        )
        
    def forward(self, x):
        # Apply initial conv layer
        y = self.cnn_layers[0:3](x)

        # Save for skip connection
        y_skip = y

        # Apply the next two conv layers
        y = self.cnn_layers[3:9](y)

        # Skip connection (element-wise addition)
        y = y + y_skip

        # Apply the last conv layer
        y = self.cnn_layers[9:](y)
        return y

class HierarchicalClassifier(nn.Module):
    def __init__(
            self, 
            heads_spec,
            dropout_p=0.1,
            img_size=256, 
            patch_size=16, 
            num_frames=3,
            bands=[0, 1, 2, 3, 4, 5], 
            backbone_weights_path=None, 
            freeze_backbone=True, 
            use_bottleneck_neck=False,
            bottleneck_reduction_factor=4,
            loss_ignore_background=False,
            debug=False
        ):
        super(HierarchicalClassifier, self).__init__()

        self.embed_dim = 768
        if num_frames % 3 != 0:
            raise ValueError("The number of frames must be a multiple of 3, it is currently: ", num_frames)
        self.num_frames = num_frames
        self.hp, self.wp = img_size // patch_size, img_size // patch_size
        self.heads_spec = heads_spec
        self.dropout_p = dropout_p
        self.loss_ignore_background = loss_ignore_background
        self.debug = debug

        if self.debug:
            print('hp and wp: ', self.hp, self.wp)

        self.prithvi = TemporalViTEncoder(
            img_size=img_size,
            patch_size=patch_size,
            num_frames=3,
            tubelet_size=1,
            in_chans=len(bands),
            embed_dim=self.embed_dim,
            depth=12,
            num_heads=8,
            mlp_ratio=4.0,
            norm_pix_loss=False,
            pretrained=backbone_weights_path,
            debug=self.debug
        )

        # (Un)freeze the backbone
        for param in self.prithvi.parameters():
            param.requires_grad = not freeze_backbone

        # Neck to transform the token-based output of the transformer into a spatial feature map
        number_of_necks = self.num_frames // 3
        if use_bottleneck_neck:
            self.necks = nn.ModuleList([ConvTransformerTokensToEmbeddingBottleneckNeck(
                embed_dim=self.embed_dim * 3,
                output_embed_dim=self.embed_dim * 3,
                drop_cls_token=True,
                Hp=self.hp,
                Wp=self.wp,
                bottleneck_reduction_factor=bottleneck_reduction_factor
            ) for _ in range(number_of_necks)])
        else:
            self.necks = nn.ModuleList([ConvTransformerTokensToEmbeddingNeck(
                embed_dim=self.embed_dim * 3,
                output_embed_dim=self.embed_dim * 3,
                drop_cls_token=True,
                Hp=self.hp,
                Wp=self.wp,
            ) for _ in range(number_of_necks)])

        # Initialize heads and loss weights based on tiers
        self.heads = nn.ModuleDict()
        self.loss_weights = {}
        self.total_classes = 0

        # Build HierarchicalFCNHeads
        head_count = 0
        for head_name, head_info in self.heads_spec.items():
            head_type = head_info['type']
            num_classes = head_info['num_classes_to_predict']
            loss_weight = head_info['loss_weight']

            if head_type == 'HierarchicalFCNHead':
                num_classes = head_info['num_classes_to_predict']
                loss_weight = head_info['loss_weight']
                kernel_size = head_info.get('kernel_size', 3)
                num_convs = head_info.get('num_convs', 1)
                num_channels = head_info.get('num_channels', 256)
                self.total_classes += num_classes

                self.heads[head_name] = HierarchicalFCNHead(
                    in_channels=(self.embed_dim * self.num_frames) if head_count == 0 else num_channels,
                    out_channels=num_channels,
                    num_classes=num_classes,
                    num_convs=num_convs,
                    kernel_size=kernel_size,
                    dropout_p=self.dropout_p,
                    debug=self.debug
                )
                self.loss_weights[head_name] = loss_weight

            # NOTE: LabelRefinementHead must be the last in the dict, otherwise the total_classes will be incorrect
            if head_type == 'LabelRefinementHead':
                self.refinement_head = LabelRefinementHead(input_channels=self.total_classes, num_classes=num_classes)
                self.refinement_head_name = head_name
                self.loss_weights[head_name] = loss_weight

            head_count += 1

        self.loss_func = nn.CrossEntropyLoss(ignore_index=-1)

    def forward(self, x):
        if self.debug:
            print(f"Input shape: {safe_shape(x)}") # torch.Size([4, 6, 9, 224, 224])

        # Extract features from the base model
        if len(self.necks) == 1:
            features = [x]
        else:
            features = torch.chunk(x, len(self.necks), dim=2)
        features = [self.prithvi(x) for x in features]

        if self.debug:
            print(f"Features shape after base model: {', '.join([safe_shape(f) for f in features])}") # (tuple) : torch.Size([4, 589, 768]), , (tuple) : torch.Size

        # Process through the neck
        features = [neck(feat_) for feat_, neck in zip(features, self.necks)]

        if self.debug:
            print(f"Features shape after neck: {', '.join([safe_shape(f) for f in features])}") # (tuple) : torch.Size([4, 2304, 224, 224]), , (tuple) : torch.Size

        # Remove from tuple
        features = [feat[0] for feat in features]
        # stack the features to create a tensor of torch.Size([4, 6912, 224, 224])
        features = torch.concatenate(features, dim=1)
        if self.debug:
            print(f"Features shape after removing tuple: {safe_shape(features)}") # torch.Size([4, 6912, 224, 224])

        # Process through the heads
        outputs = {}
        for tier_name, head in self.heads.items():
            output, features = head(features)
            outputs[tier_name] = output

            if self.debug:
                print(f"Features shape after {tier_name} head: {safe_shape(features)}")
                print(f"Output shape after {tier_name} head: {safe_shape(output)}")

        # Process through the classification refinement head
        output_concatenated = torch.cat(list(outputs.values()), dim=1)
        output_refinement_head = self.refinement_head(output_concatenated)
        outputs[self.refinement_head_name] = output_refinement_head

        return outputs

    def calculate_loss(self, outputs, targets):
        total_loss = 0
        loss_per_head = {}
        for head_name, output in outputs.items():
            if self.debug:
                print(f"Target index for {head_name}: {self.heads_spec[head_name]['target_idx']}")
            target = targets[self.heads_spec[head_name]['target_idx']]
            loss_target = target
            if self.loss_ignore_background:
                loss_target = target.clone()  # Clone as original target needed in backward pass
                loss_target[loss_target == 0] = -1  # Set background class to ignore_index -1 for loss calculation
            loss = self.loss_func(output, loss_target)
            loss_per_head[f'{head_name}'] = loss
            total_loss += loss * self.loss_weights[head_name]
        
        return total_loss, loss_per_head

class Messis(pl.LightningModule, PyTorchModelHubMixin):
    def __init__(self, hparams):
        super().__init__()
        self.save_hyperparameters(hparams)

        self.model = HierarchicalClassifier(
            heads_spec=hparams['heads_spec'],
            dropout_p=hparams.get('dropout_p'),
            img_size=hparams.get('img_size'),
            patch_size=hparams.get('patch_size'),
            num_frames=hparams.get('num_frames'),
            bands=hparams.get('bands'),
            backbone_weights_path=hparams.get('backbone_weights_path'),
            freeze_backbone=hparams['freeze_backbone'],
            use_bottleneck_neck=hparams.get('use_bottleneck_neck'),
            bottleneck_reduction_factor=hparams.get('bottleneck_reduction_factor'),
            loss_ignore_background=hparams.get('loss_ignore_background'),
            debug=hparams.get('debug')
        )

    def forward(self, x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        return self.__step(batch, batch_idx, "train")

    def validation_step(self, batch, batch_idx):
        return self.__step(batch, batch_idx, "val")
    
    def test_step(self, batch, batch_idx):
        return self.__step(batch, batch_idx, "test")
        
    def configure_optimizers(self):
        # select case on optimizer
        match self.hparams.get('optimizer', 'Adam'):
            case 'Adam':
                optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.get('lr', 1e-3))
            case 'AdamW':
                optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.get('lr', 1e-3), weight_decay=self.hparams.get('optimizer_weight_decay', 0.01))
            case 'SGD':
                optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.get('lr', 1e-3), momentum=self.hparams.get('optimizer_momentum', 0.9))
            case 'Lion':
                # https://github.com/lucidrains/lion-pytorch | Typically lr 3-10 times lower than Adam and weight_decay 3-10 times higher
                optimizer = Lion(self.parameters(), lr=self.hparams.get('lr', 1e-4), weight_decay=self.hparams.get('optimizer_weight_decay', 0.1))
            case _:
                raise ValueError(f"Optimizer {self.hparams.get('optimizer')} not supported")
        return optimizer

    def __step(self, batch, batch_idx, stage):
        inputs, targets = batch
        targets = torch.stack(targets[0])
        outputs = self(inputs)
        loss, loss_per_head = self.model.calculate_loss(outputs, targets)
        loss_per_head_named = {f'{stage}_loss_{head}': loss_per_head[head] for head in loss_per_head}
        loss_proportions = { f'{stage}_loss_{head}_proportion': round(loss_per_head[head].item() / loss.item(), 2) for head in loss_per_head}
        loss_detail_dict = {**loss_per_head_named, **loss_proportions}

        if self.hparams.get('debug'):
            print(f"Step Inputs shape: {safe_shape(inputs)}")
            print(f"Step Targets shape: {safe_shape(targets)}")
            print(f"Step Outputs dict keys: {outputs.keys()}")

        # NOTE: All metrics other than loss are tracked by callbacks (LogMessisMetrics)
        self.log_dict({f'{stage}_loss': loss, **loss_detail_dict}, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss, 'outputs': outputs}
        
class LogConfusionMatrix(pl.Callback):
    def __init__(self, hparams, dataset_info_file, debug=False):
        super().__init__()

        assert hparams.get('heads_spec') is not None, "heads_spec must be defined in the hparams"
        self.tiers_dict = {k: v for k, v in hparams.get('heads_spec').items() if v.get('is_metrics_tier', False)}
        self.last_tier_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_last_tier', False)), None)
        self.final_head_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_final_head', False)), None)

        assert self.last_tier_name is not None, "No tier found with 'is_last_tier' set to True"
        assert self.final_head_name is not None, "No head found with 'is_final_head' set to True"

        self.tiers = list(self.tiers_dict.keys())
        self.phases = ['train', 'val', 'test']
        self.modes = ['pixelwise', 'majority']
        self.debug = debug

        if debug:
            print(f"Final head identified as: {self.final_head_name}")
            print(f"LogConfusionMatrix Metrics over | Phases: {self.phases}, Tiers: {self.tiers}, Modes: {self.modes}")
        
        with open(dataset_info_file, 'r') as f:
            self.dataset_info = json.load(f)

        # Initialize confusion matrices
        self.metrics_to_compute = ['confusion_matrix']
        self.metrics = {phase: {tier: {mode: self.__init_metrics(tier, phase) for mode in self.modes} for tier in self.tiers} for phase in self.phases}

    def __init_metrics(self, tier, phase):
        num_classes = self.tiers_dict[tier]['num_classes_to_predict']
        confusion_matrix = classification.MulticlassConfusionMatrix(num_classes=num_classes)

        return {
            'confusion_matrix': confusion_matrix
        }

    def setup(self, trainer, pl_module, stage=None):
        # Move all metrics to the correct device at the start of the training/validation
        device = pl_module.device
        for phase_metrics in self.metrics.values():
            for tier_metrics in phase_metrics.values():
                for mode_metrics in tier_metrics.values():
                    for metric in self.metrics_to_compute:
                        mode_metrics[metric].to(device)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.__update_confusion_matrices(trainer, pl_module, outputs, batch, batch_idx, 'train')

    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.__update_confusion_matrices(trainer, pl_module, outputs, batch, batch_idx, 'val')

    def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.__update_confusion_matrices(trainer, pl_module, outputs, batch, batch_idx, 'test')

    def __update_confusion_matrices(self, trainer, pl_module, outputs, batch, batch_idx, phase):
        if trainer.sanity_checking:
            return

        targets = torch.stack(batch[1][0]) # (tiers, batch, H, W)
        outputs = outputs['outputs'][self.final_head_name] # (batch, C, H, W)
        field_ids = batch[1][1].permute(1, 0, 2, 3)[0]

        pixelwise_outputs, majority_outputs = LogConfusionMatrix.get_pixelwise_and_majority_outputs(outputs, self.tiers, field_ids, self.dataset_info)        
        
        for preds, mode in zip([pixelwise_outputs, majority_outputs], self.modes):
            # Update all metrics
            assert len(preds) == len(targets), f"Number of predictions and targets do not match: {len(preds)} vs {len(targets)}"
            assert len(preds) == len(self.tiers), f"Number of predictions and tiers do not match: {len(preds)} vs {len(self.tiers)}"
            
            for pred, target, tier in zip(preds, targets, self.tiers):
                if self.debug:
                    print(f"Updating confusion matrix for {phase} {tier} {mode}")
                metrics = self.metrics[phase][tier][mode]
                # flatten and remove background class if the mode is majority (such that the background class is not included in the confusion matrix)
                if mode == 'majority':
                    pred = pred[target != 0]
                    target = target[target != 0]
                metrics['confusion_matrix'].update(pred, target)


    @staticmethod
    def get_pixelwise_and_majority_outputs(refinement_head_outputs, tiers, field_ids, dataset_info):
        """
        Get the pixelwise and majority predictions from the model outputs.
        The pixelwise tier predictions are derived from the refinement_head_outputs predictions. 
        The majority last tier predictions are derived from the refinement_head_outputs. And then the majority lower-tier predictions are derived from the majority highest-tier predictions.

        Also sets the background to 0 for all field majority predictions (regardless of what the model predicts for the background class).
        As this is a classification task and not a segmentation task and the field boundaries are known beforehand and not of any interest.

        Args:
            refinement_head_outputs (torch.Tensor(batch, C, H, W)): The probability outputs from the model for the refined tier.
            tiers (list of str): List of tiers e.g. ['tier1', 'tier2', 'tier3'].
            field_ids (torch.Tensor(batch, H, W)): The field IDs for each prediction.
            dataset_info (dict): The dataset information.

        Returns:
            torch.Tensor(tiers, batch, H, W): The pixelwise predictions.
            torch.Tensor(tiers, batch, H, W): The majority predictions.
        """
        
        # Assuming the highest tier is the last one in the list
        highest_tier = tiers[-1]

        pixelwise_highest_tier = torch.softmax(refinement_head_outputs, dim=1).argmax(dim=1)  # (batch, H, W)
        majority_highest_tier = LogConfusionMatrix.get_field_majority_preds(refinement_head_outputs, field_ids)

        tier_mapping = {tier: dataset_info[f'{highest_tier}_to_{tier}'] for tier in tiers if tier != highest_tier}

        pixelwise_outputs = {highest_tier: pixelwise_highest_tier}
        majority_outputs = {highest_tier: majority_highest_tier}

        # Initialize pixelwise and majority outputs for each tier
        for tier in tiers:
            if tier != highest_tier:
                pixelwise_outputs[tier] = torch.zeros_like(pixelwise_highest_tier)
                majority_outputs[tier] = torch.zeros_like(majority_highest_tier)

        # Map the highest tier to lower tiers
        for i, mappings in enumerate(zip(*tier_mapping.values())):
            for j, tier in enumerate(tier_mapping.keys()):
                pixelwise_outputs[tier][pixelwise_highest_tier == i] = mappings[j]
                majority_outputs[tier][majority_highest_tier == i] = mappings[j]

        pixelwise_outputs_stacked = torch.stack([pixelwise_outputs[tier] for tier in tiers])
        majority_outputs_stacked = torch.stack([majority_outputs[tier] for tier in tiers])

        # Ensure these are tensors
        assert isinstance(pixelwise_outputs_stacked, torch.Tensor), "pixelwise_outputs_stacked is not a tensor"
        assert isinstance(majority_outputs_stacked, torch.Tensor), "majority_outputs_stacked is not a tensor"

        return pixelwise_outputs_stacked, majority_outputs_stacked


    @staticmethod
    def get_field_majority_preds(output, field_ids):
        """
        Get the majority prediction for each field in the batch. The majority excludes the background class.

        Args:
            output (torch.Tensor(batch, C, H, W)): The probability outputs from the model (tier3_refined)
            field_ids (torch.Tensor(batch, H, W)): The field IDs for each prediction.

        Returns:
            torch.Tensor(batch, H, W): The majority predictions.
        """
        # remove the background class
        pixelwise = torch.softmax(output[:, 1:, :, :], dim=1).argmax(dim=1) + 1  # (batch, H, W)
        majority_preds = torch.zeros_like(pixelwise)
        for batch in range(len(pixelwise)):
            field_ids_batch = field_ids[batch]
            for field_id in np.unique(field_ids_batch.cpu().numpy()):
                if field_id == 0:
                    continue
                field_mask = field_ids_batch == field_id
                flattened_pred = pixelwise[batch][field_mask].view(-1)  # Flatten the prediction
                flattened_pred = flattened_pred[flattened_pred != 0]  # Exclude background class
                if len(flattened_pred) == 0:
                    continue
                mode_pred, _ = torch.mode(flattened_pred) # Compute mode prediction
                majority_preds[batch][field_mask] = mode_pred.item()
        return majority_preds

    def on_train_epoch_end(self, trainer, pl_module):
        # Log and then reset the confusion matrices after training epoch
        self.__log_and_reset_confusion_matrices(trainer, pl_module, 'train')

    def on_validation_epoch_end(self, trainer, pl_module):
        # Log and then reset the confusion matrices after validation epoch
        self.__log_and_reset_confusion_matrices(trainer, pl_module, 'val')

    def on_test_epoch_end(self, trainer, pl_module):
        # Log and then reset the confusion matrices after test epoch
        self.__log_and_reset_confusion_matrices(trainer, pl_module, 'test')

    def __log_and_reset_confusion_matrices(self, trainer, pl_module, phase):
        if trainer.sanity_checking:
            return

        for tier in self.tiers:
            for mode in self.modes:
                metrics = self.metrics[phase][tier][mode]
                confusion_matrix = metrics['confusion_matrix']
                if self.debug:
                    print(f"Logging and resetting confusion matrix for {phase} {tier} Update count: {confusion_matrix._update_count}")
                matrix = confusion_matrix.compute()  # columns are predictions and rows are targets

                # Calculate percentages
                matrix = matrix.float()
                row_sums = matrix.sum(dim=1, keepdim=True)
                matrix_percent = matrix / row_sums

                # Ensure percentages sum to 1 for each row or handle NaNs
                row_sum_check = matrix_percent.sum(dim=1)
                valid_rows = ~torch.isnan(row_sum_check)
                if valid_rows.any():
                    assert torch.allclose(row_sum_check[valid_rows], torch.ones_like(row_sum_check[valid_rows]), atol=1e-2), "Percentages do not sum to 1 for some valid rows"
                    
                # Sort the matrix and labels by the total number of instances
                sorted_indices = row_sums.squeeze().argsort(descending=True)
                matrix_percent = matrix_percent[sorted_indices, :] # sort rows
                matrix_percent = matrix_percent[:, sorted_indices] # sort columns
                class_labels = [self.dataset_info[tier][i] for i in sorted_indices]
                row_sums_sorted = row_sums[sorted_indices]

                # Check for zero rows after sorting
                zero_rows = (row_sums_sorted == 0).squeeze()

                fig, ax = plt.subplots(figsize=(matrix.size(0), matrix.size(0)), dpi=140)

                ax.matshow(matrix_percent.cpu().numpy(), cmap='viridis')

                ax.xaxis.set_major_locator(ticker.FixedLocator(range(matrix.size(1) + 1)))
                ax.yaxis.set_major_locator(ticker.FixedLocator(range(matrix.size(0) + 1)))

                ax.set_xticklabels(class_labels + [''], rotation=45)
                ax.set_yticklabels(class_labels + [''])

                # Add total number of instances to the y-axis labels
                y_labels = [f'{class_labels[i]} [n={int(row_sums_sorted[i].item()):,.0f}]'.replace(',', "'") for i in range(matrix.size(0))]
                ax.set_yticklabels(y_labels + [''])

                ax.set_xlabel('Predictions')
                ax.set_ylabel('Targets')

                # Move x-axis label and ticks to the top
                ax.xaxis.set_label_position('top')
                ax.xaxis.set_ticks_position('top')

                fig.tight_layout()

                for i in range(matrix.size(0)):
                    for j in range(matrix.size(1)):
                        if zero_rows[i]:
                            ax.text(j, i, 'N/A', ha='center', va='center', color='black')
                        else:
                            ax.text(j, i, f'{matrix_percent[i, j]:.2f}', ha='center', va='center', color='#F88379', weight='bold') # coral red
                trainer.logger.experiment.log({f"{phase}_{tier}_confusion_matrix_{mode}": wandb.Image(fig)})
                plt.close()
                confusion_matrix.reset()

class LogMessisMetrics(pl.Callback):
    def __init__(self, hparams, dataset_info_file, debug=False):
        super().__init__()

        assert hparams.get('heads_spec') is not None, "heads_spec must be defined in the hparams"
        self.tiers_dict = {k: v for k, v in hparams.get('heads_spec').items() if v.get('is_metrics_tier', False)}
        self.last_tier_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_last_tier', False)), None)
        self.final_head_name = next((k for k, v in hparams.get('heads_spec').items() if v.get('is_final_head', False)), None)

        assert self.last_tier_name is not None, "No tier found with 'is_last_tier' set to True"
        assert self.final_head_name is not None, "No head found with 'is_final_head' set to True"

        self.tiers = list(self.tiers_dict.keys())
        self.phases = ['train', 'val', 'test']
        self.modes = ['pixelwise', 'majority']
        self.debug = debug

        if debug:
            print(f"Last tier identified as: {self.last_tier_name}")
            print(f"Final head identified as: {self.final_head_name}")
            print(f"LogMessisMetrics Metrics over | Phases: {self.phases}, Tiers: {self.tiers}, Modes: {self.modes}")

        with open(dataset_info_file, 'r') as f:
            self.dataset_info = json.load(f)

        # Initialize metrics
        self.metrics_to_compute = ['accuracy', 'weighted_accuracy', 'precision', 'weighted_precision', 'recall', 'weighted_recall' ,'f1', 'weighted_f1', 'cohen_kappa']
        self.metrics = {phase: {tier: {mode: self.__init_metrics(tier, phase) for mode in self.modes} for tier in self.tiers} for phase in self.phases}
        self.images_to_log = {phase: {mode: None for mode in self.modes} for phase in self.phases}
        self.images_to_log_targets = {phase: None for phase in self.phases}
        self.field_ids_to_log_targets = {phase: None for phase in self.phases}
        self.inputs_to_log = {phase: None for phase in self.phases}

    def __init_metrics(self, tier, phase):
        num_classes = self.tiers_dict[tier]['num_classes_to_predict']

        accuracy = classification.MulticlassAccuracy(num_classes=num_classes, average='macro')
        weighted_accuracy = classification.MulticlassAccuracy(num_classes=num_classes, average='weighted')
        per_class_accuracies = {
            class_index: classification.BinaryAccuracy() for class_index in range(num_classes)
        }
        precision = classification.MulticlassPrecision(num_classes=num_classes, average='macro')
        weighted_precision = classification.MulticlassPrecision(num_classes=num_classes, average='weighted')
        recall = classification.MulticlassRecall(num_classes=num_classes, average='macro')
        weighted_recall = classification.MulticlassRecall(num_classes=num_classes, average='weighted')
        f1 = classification.MulticlassF1Score(num_classes=num_classes, average='macro')
        weighted_f1 = classification.MulticlassF1Score(num_classes=num_classes, average='weighted')
        cohen_kappa = classification.MulticlassCohenKappa(num_classes=num_classes)

        return {
            'accuracy': accuracy,
            'weighted_accuracy': weighted_accuracy,
            'per_class_accuracies': per_class_accuracies,
            'precision': precision,
            'weighted_precision': weighted_precision,
            'recall': recall,
            'weighted_recall': weighted_recall,
            'f1': f1,
            'weighted_f1': weighted_f1,
            'cohen_kappa': cohen_kappa
        }

    def setup(self, trainer, pl_module, stage=None):
        # Move all metrics to the correct device at the start of the training/validation
        device = pl_module.device
        for phase_metrics in self.metrics.values():
            for tier_metrics in phase_metrics.values():
                for mode_metrics in tier_metrics.values():
                    for metric in self.metrics_to_compute:
                        mode_metrics[metric].to(device)
                    for class_accuracy in mode_metrics['per_class_accuracies'].values():
                        class_accuracy.to(device)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.__on_batch_end(trainer, pl_module, outputs, batch, batch_idx, 'train')

    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.__on_batch_end(trainer, pl_module, outputs, batch, batch_idx, 'val')

    def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.__on_batch_end(trainer, pl_module, outputs, batch, batch_idx, 'test')

    def __on_batch_end(self, trainer: pl.Trainer, pl_module, outputs, batch, batch_idx, phase):
        if trainer.sanity_checking:
            return
        if self.debug:
            print(f"{phase} batch ended. Updating metrics...")

        targets = torch.stack(batch[1][0]) # (tiers, batch, H, W)
        outputs = outputs['outputs'][self.final_head_name] # (batch, C, H, W)        
        field_ids = batch[1][1].permute(1, 0, 2, 3)[0]

        pixelwise_outputs, majority_outputs = LogConfusionMatrix.get_pixelwise_and_majority_outputs(outputs, self.tiers, field_ids, self.dataset_info)        

        for preds, mode in zip([pixelwise_outputs, majority_outputs], self.modes):

            # Update all metrics
            assert preds.shape == targets.shape, f"Shapes of predictions and targets do not match: {preds.shape} vs {targets.shape}"
            assert preds.shape[0] == len(self.tiers), f"Number of tiers in predictions and tiers do not match: {preds.shape[0]} vs {len(self.tiers)}"
        
            self.images_to_log[phase][mode] = preds[-1]
            
            for pred, target, tier in zip(preds, targets, self.tiers):
                # flatten and remove background class if the mode is majority (such that the background class is not considered in the metrics)
                if mode == 'majority':
                    pred = pred[target != 0]
                    target = target[target != 0]
                metrics = self.metrics[phase][tier][mode]
                for metric in self.metrics_to_compute:
                    metrics[metric].update(pred, target)
                    if self.debug:
                        print(f"{phase} {tier} {mode} {metric} updated. Update count: {metrics[metric]._update_count}")
                self.__update_per_class_metrics(pred, target, metrics['per_class_accuracies'])

        self.images_to_log_targets[phase] = targets[-1]
        self.field_ids_to_log_targets[phase] = field_ids
        self.inputs_to_log[phase] = batch[0]

    def __update_per_class_metrics(self, preds, targets, per_class_accuracies):
        for class_index, class_accuracy in per_class_accuracies.items():
            if not (targets == class_index).any():
                continue
            
            if class_index == 0:
                # Mask out non-background elements for background class (0)
                class_mask = targets != 0
            else:
                # Mask out background elements for other classes
                class_mask = targets == 0

            preds_fields = preds[~class_mask]
            targets_fields = targets[~class_mask]

            # Prepare for binary classification (needs to be float)
            preds_class = (preds_fields == class_index).float()
            targets_class = (targets_fields == class_index).float()

            class_accuracy.update(preds_class, targets_class)

            if self.debug:
                print(f"Shape of preds_fields: {preds_fields.shape}")
                print(f"Shape of targets_fields: {targets_fields.shape}")
                print(f"Unique values in preds_fields: {torch.unique(preds_fields)}")
                print(f"Unique values in targets_fields: {torch.unique(targets_fields)}")
                print(f"Per-class metrics for class {class_index} updated. Update count: {per_class_accuracies[class_index]._update_count}")

    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self.__on_epoch_end(trainer, pl_module, 'train')

    def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self.__on_epoch_end(trainer, pl_module, 'val')

    def on_test_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self.__on_epoch_end(trainer, pl_module, 'test')

    def __on_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule, phase):
        if trainer.sanity_checking:
            return # Skip during sanity check (avoid warning about metric compute being called before update)
        for tier in self.tiers:
            for mode in self.modes:
                metrics = self.metrics[phase][tier][mode]

                # Calculate and reset in tier: Accuracy, WeightedAccuracy, Precision, Recall, F1, Cohen's Kappa
                metrics_dict = {metric: metrics[metric].compute() for metric in self.metrics_to_compute}
                pl_module.log_dict({f"{phase}_{metric}_{tier}_{mode}": v for metric, v in metrics_dict.items()}, on_step=False, on_epoch=True)
                for metric in self.metrics_to_compute:
                    metrics[metric].reset()

                # Per-class metrics
                # NOTE: Some literature reports "per class accuracy" but what they actually mean is "per class recall".
                # Using the accuracy formula per class has no value in our imbalanced multi-class setting (TN's inflate scores!)
                # We calculate all 4 metrics. This allows us to calculate any macro/micro score later if needed.
                class_metrics = []
                class_names_mapping = self.dataset_info[tier.split('_')[0] if '_refined' in tier else tier] 
                for class_index, class_accuracy in metrics['per_class_accuracies'].items():
                    if class_accuracy._update_count == 0:
                        continue  # Skip if no updates have been made
                    tp, tn, fp, fn = class_accuracy.tp, class_accuracy.tn, class_accuracy.fp, class_accuracy.fn
                    recall = (tp / (tp + fn)).item() if tp + fn > 0 else 0
                    precision = (tp / (tp + fp)).item() if tp + fp > 0 else 0
                    f1 = (2 * (precision * recall) / (precision + recall)) if precision + recall > 0 else 0
                    n_of_class = (tp + fn).item()
                    class_metrics.append([class_index, class_names_mapping[class_index], precision, recall, f1, class_accuracy.compute().item(), n_of_class])
                    class_accuracy.reset()
                wandb_table = wandb.Table(data=class_metrics, columns=["Class Index", "Class Name", "Precision", "Recall", "F1", "Accuracy", "N"])
                trainer.logger.experiment.log({f"{phase}_per_class_metrics_{tier}_{mode}": wandb_table})

        # use the same n_classes for all images, such that they are comparable
        n_classes = max([
            torch.max(self.images_to_log_targets[phase]),
            torch.max(self.images_to_log[phase]["majority"]),
            torch.max(self.images_to_log[phase]["pixelwise"])
        ])
        images     = [LogMessisMetrics.process_images(self.images_to_log[phase][mode], n_classes) for mode in self.modes]
        images.append(LogMessisMetrics.create_positive_negative_image(self.images_to_log[phase]["majority"], self.images_to_log_targets[phase]))
        images.append(LogMessisMetrics.process_images(self.images_to_log_targets[phase], n_classes))
        images.append(LogMessisMetrics.process_images(self.field_ids_to_log_targets[phase].cpu()))

        examples = []
        for i in range(len(images[0])):
            example = np.concatenate([img[i] for img in images], axis=0)
            examples.append(wandb.Image(example, caption=f"From Top to Bottom: {self.modes[0]}, {self.modes[1]}, right/wrong classifications, target, fields"))

        trainer.logger.experiment.log({f"{phase}_examples": examples})

        # Log segmentation masks
        batch_input_data = self.inputs_to_log[phase].cpu() # shape [BS, 6, N_TIMESTEPS, 224, 224]
        ground_truth_masks = self.images_to_log_targets[phase].cpu().numpy()
        pixel_wise_masks = self.images_to_log[phase]["pixelwise"].cpu().numpy()
        field_majority_masks = self.images_to_log[phase]["majority"].cpu().numpy()
        correctness_masks = self.create_positive_negative_segmentation_mask(field_majority_masks, ground_truth_masks)
        class_labels = {idx: name for idx, name in enumerate(self.dataset_info[self.last_tier_name])}

        segmentation_masks = []
        for input_data, ground_truth_mask, pixel_wise_mask, field_majority_mask, correctness_mask in zip(batch_input_data, ground_truth_masks, pixel_wise_masks, field_majority_masks, correctness_masks):
            middle_timestep_index = input_data.shape[1] // 2  # Get the middle timestamp index
            gamma = 2.5  # Gamma for brightness adjustment
            rgb_image = input_data[:3, middle_timestep_index, :, :].permute(1, 2, 0).numpy()  # Shape [224, 224, 3]
            rgb_image = (rgb_image - rgb_image.min()) / (rgb_image.max() - rgb_image.min())
            rgb_image = np.power(rgb_image, 1.0 / gamma)
            rgb_image = (rgb_image * 255).astype(np.uint8)

            mask_img = wandb.Image(
                rgb_image,
                masks={
                    "predictions_pixel_wise": {"mask_data": pixel_wise_mask, "class_labels": class_labels},
                    "predictions_field_majority": {"mask_data": field_majority_mask, "class_labels": class_labels},
                    "ground_truth": {"mask_data": ground_truth_mask, "class_labels": class_labels},
                    "correctness": {"mask_data": correctness_mask, "class_labels": { 0: "Background", 1: "Wrong", 2: "Right" }},
                },
            )
            segmentation_masks.append(mask_img)

        trainer.logger.experiment.log({f"{phase}_segmentation_mask": segmentation_masks})

        if self.debug:
            print(f"{phase} epoch ended. Logging & resetting metrics...", trainer.sanity_checking)

    @staticmethod
    def create_positive_negative_segmentation_mask(field_majority_masks, ground_truth_masks):
        """
        Create a tensor that shows the positive and negative classifications of the model.

        Args:
            field_majority_masks (np.ndarray): The field majority masks generated by the model.
            ground_truth_masks (np.ndarray): The ground truth masks.

        Returns:
            np.ndarray: An array with values:
                - 0 where the target is 0,
                - 2 where the prediction matches the target,
                - 1 where the prediction does not match the target.
        """
        correctness_mask = np.zeros_like(ground_truth_masks, dtype=int)

        matches = (field_majority_masks == ground_truth_masks) & (ground_truth_masks != 0)
        correctness_mask[matches] = 2

        mismatches = (field_majority_masks != ground_truth_masks) & (ground_truth_masks != 0)
        correctness_mask[mismatches] = 1

        return correctness_mask

    @staticmethod
    def create_positive_negative_image(generated_images, target_images):
        """
        Create an image that shows the positive and negative classifications of the model.

        Args:
            generated_images (torch.Tensor): The images generated by the model.
            target_images (torch.Tensor): The target images.

        Returns:
            list: A list of processed images.
        """
        classification_masks = generated_images == target_images
        processed_imgs = []
        for mask, target in zip(classification_masks, target_images):
            # color the background white, right classifications green, wrong classifications red
            colored_img = torch.zeros((mask.shape[0], mask.shape[1], 3), dtype=torch.uint8)
            mask = mask.bool()  # Convert to boolean tensor
            colored_img[mask] = torch.tensor([0, 255, 0], dtype=torch.uint8)
            colored_img[~mask] = torch.tensor([255, 0, 0], dtype=torch.uint8)
            colored_img[target == 0] = torch.tensor([0, 0, 0], dtype=torch.uint8)
            processed_imgs.append(colored_img.cpu())
        return processed_imgs

    @staticmethod
    def process_images(imgs, max=None):
        """
        Process a batch of images to be logged on wandb.

        Args:
            imgs (torch.Tensor): A batch of images with shape (B, H, W) to be processed.
            max (float, optional): The maximum value to normalize the images. Defaults to None. If None, the maximum value in the batch is used.
        """
        if max is None:
            max = np.max(imgs.cpu().numpy())
        normalized_img = imgs / max
        processed_imgs = []
        for img in normalized_img.cpu().numpy():
            if max < 60:
                cmap = ListedColormap(plt.get_cmap('tab20').colors + plt.get_cmap('tab20b').colors + plt.get_cmap('tab20c').colors)
            else:
                cmap = plt.get_cmap('viridis')
            colored_img = cmap(img)
            colored_img[img == 0] = [0, 0, 0, 1]
            colored_img_uint8 = (colored_img[:, :, :3] * 255).astype(np.uint8)
            processed_imgs.append(colored_img_uint8)
        return processed_imgs