In [1]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import transformers
import datasets
import evaluate
import datasets
import huggingface_hub
import albumentations as A
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from huggingface_hub import notebook_login
import accelerate
from accelerate import Accelerator
from transformers import pipeline
from PIL import Image
from glob import glob
from transformers import SegformerImageProcessor, SegformerModel, SegformerConfig, AutoImageProcessor, SegformerForSemanticSegmentation
# Allow TF32 for CUDA matrix multiplications (a speed/precision trade-off on GPUs that support it)
torch.backends.cuda.matmul.allow_tf32 = True
# Hugging Face Hub repository id used both for uploading the trained model and for downloading it
hf_model_name = "glacierscopessegmentation/glacier_segmentation_transformer"


1
2
3
4
5
5.1
5.2
5.3
6
7


In [None]:
# Call the notebook_login function to log in to Hugging Face's hub
# notebook_login()

# Make sure to log in (either here or with `huggingface-cli login`); otherwise the model cannot be synced to the Hub.
# Access to the dataset and model is currently limited to Aashray and Brian. Email aashraychegu@hotmail.com to request access.

In [22]:
# Fixed seed so that a Restart & Run All reproduces the same train/test partition
SPLIT_SEED = 42

ds = load_dataset("glacierscopessegmentation/secondleg")

# Hold out 5% of the "train" split for evaluation; `ds` is rebound to the
# resulting {"train", "test"} dictionary.
ds = ds["train"].train_test_split(test_size=0.05, seed=SPLIT_SEED)
train_ds = ds["train"]
test_ds = ds["test"]

# Class id -> name. The id is the grey value stored in the mask
# (00 00 00, 01 01 01 and 02 02 02 as RGB respectively).
id2label = {
    0: "sky",
    1: "surface-to-bed",
    2: "bed-to-bottom",
}
label2id = {name: class_id for class_id, name in id2label.items()}
num_labels = len(id2label)

len(train_ds), len(test_ds)


Found cached dataset parquet (C:/Users/aashr/.cache/huggingface/datasets/glacierscopessegmentation___parquet/glacierscopessegmentation--secondleg-718284968c2f234c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

(8033, 423)

In [17]:
# Checkpoint that supplies the image processor. The processor resizes the dataset
# images to the size the model expects, and it affects the output scaling for both
# training and testing, so the same one must be used everywhere.
checkpoint = "nvidia/MiT-b0"
test_image_processor = SegformerImageProcessor.from_pretrained(checkpoint)

# Architecture of the model trained in this notebook: 3 input channels and one
# output class per entry of id2label.
# (Loading the processor and model straight from `checkpoint` was only tried for
# testing; it performed badly because it assumes 150 categories while this task
# has 3.)
test_config = SegformerConfig(
    num_channels=3,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    depths=[2, 3, 4, 3],
    hidden_sizes=[64, 128, 320, 512],
    decoder_hidden_size=256 * 3,
)

# The model is built from the configuration alone and moved to the first CUDA device
test_model = SegformerForSemanticSegmentation(test_config).to("cuda:0")



Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/MiT-b0 and are newly initialized: ['decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight', 'decode_head.classifier.weight', 'decode_head.batch_norm.bias', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.0.proj.weight', 'decode_head.batch_norm.running_mean', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_var', 'decode_head.classifier.bias', 'decode_head.linear_c.2.proj.bias', 'decode_head.batch_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Data augmentation for training. Using albumentations lets the same transform be
# applied to the image and to its mask at the same time, so the two stay aligned.
# Each distortion below is configured with p=0.3.
augmentation_steps = [
    A.ElasticTransform(p=0.3),
    A.GridDistortion(p=0.3),
    A.Perspective(p=0.3),
]
transform = A.Compose(augmentation_steps, additional_targets={"mask": "mask"})

In [24]:
# Batch transform for the training split: augment, then preprocess
def train_transforms(example_batch):
    """Augment a batch of training examples and run the image processor on it.

    Args:
        example_batch: mapping with an "image" and a "label" sequence of equal
            length; each image is converted to RGB before augmentation.

    Returns:
        Whatever ``test_image_processor`` returns for the augmented images and masks.
    """
    augmented_images = []
    augmented_masks = []
    for picture, label in zip(example_batch["image"], example_batch["label"]):
        # One call augments the image and its mask together
        result = transform(image=np.array(picture.convert("RGB")), mask=np.array(label))
        augmented_images.append(result["image"])
        augmented_masks.append(result["mask"])
    return test_image_processor(augmented_images, augmented_masks)

# Batch transform for the validation split: preprocess only, no augmentation
def val_transforms(example_batch):
    """Run the image processor on a batch of validation examples.

    Args:
        example_batch: mapping with an "image" and a "label" sequence; every
            image is converted to RGB, the labels are passed through unchanged.

    Returns:
        Whatever ``test_image_processor`` returns for the images and labels.
    """
    rgb_images = []
    for picture in example_batch["image"]:
        rgb_images.append(picture.convert("RGB"))
    label_masks = list(example_batch["label"])
    return test_image_processor(rgb_images, label_masks)

# Register the transforms so they run on the fly whenever a batch is loaded:
# augmentation + preprocessing for training, preprocessing only for evaluation
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)


In [None]:
# Load the "mean_iou" metric (mean intersection over union) used to evaluate
# the semantic segmentation model; compute_metrics reads it as a global
metric = evaluate.load("mean_iou")

# Metric callback for the Trainer: mean intersection over union
def compute_metrics(eval_pred):
    """Score evaluation predictions with the mean-IoU metric.

    Args:
        eval_pred: pair ``(logits, labels)`` of NumPy arrays. The logits are
            indexed as (batch_size, num_labels, height, width); the last two
            axes of ``labels`` give the size the logits are upsampled to.

    Returns:
        dict: the result of ``metric.compute`` in which every NumPy array
        value has been converted to a plain list.
    """
    raw_logits, labels = eval_pred
    target_size = labels.shape[-2:]

    # Gradients are not needed for evaluation
    with torch.no_grad():
        # Upsample the logits to the label resolution. This is done for all
        # predictions at once and can lead to very high RAM usage.
        upsampled = nn.functional.interpolate(
            torch.from_numpy(raw_logits),
            size=target_size,
            mode="bilinear",
            align_corners=False,
        )
        # Highest-scoring class per pixel, handed back to NumPy for the metric
        # (the tensor is already on the CPU, so detach/cpu are no-ops here)
        pred_labels = upsampled.argmax(dim=1).detach().cpu().numpy()

        scores = metric.compute(
            predictions=pred_labels,
            references=labels,
            num_labels=num_labels,
            reduce_labels=False,
            ignore_index=255,
        )
        # Replace NumPy arrays with lists; everything else is passed through as is
        return {
            name: (value.tolist() if type(value) is np.ndarray else value)
            for name, value in scores.items()
        }

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="glacformer",  # Directory for checkpoints and model predictions
    learning_rate=6e-5,  # Initial learning rate of the optimizer
    num_train_epochs=1,  # Total number of training epochs
    auto_find_batch_size=True,  # Let the Trainer look for a batch size that fits in memory
    save_total_limit=3,  # Keep at most 3 checkpoints; older ones are deleted
    # Number of evaluation steps whose outputs are kept on the accelerator before
    # they are moved to the CPU. This was 0, which is not a valid step count (some
    # Trainer versions compute `step % eval_accumulation_steps`). 1 offloads after
    # every step, which is the most memory-friendly choice; raise it for speed.
    eval_accumulation_steps=1,
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save a checkpoint at the end of every epoch
    # save_steps / eval_steps were removed: they only apply to the "steps"
    # strategies and were ignored with the per-epoch strategies used here.
    logging_steps=30,  # Log the learning rate and other metrics every 30 update steps
    # The on-the-fly transforms read the raw "image" and "label" columns, so the
    # Trainer must not drop columns that the model's forward() does not accept.
    remove_unused_columns=False,
    fp16=True,  # Mixed-precision training with 16-bit floats (needs a CUDA GPU)
    tf32=True,  # Use TF32 math on GPUs that support it (a speed option)
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 batches per optimizer step
    hub_model_id=hf_model_name,  # Repository id on the Hugging Face Hub
)

# Define the trainer
trainer = Trainer(
    model=test_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Start the training process (evaluation and checkpointing happen once per epoch,
# as set by evaluation_strategy / save_strategy in the training arguments)
trainer.train()

In [None]:
# Save the trained weights and configuration locally
trainer.model.save_pretrained("glacformer")

# Upload the model to the Hub repository named by hf_model_name.
# This replaces the huggingface_hub.Repository clone / pull / push sequence: that
# class is deprecated, and cloning fails when "glacformer" is a non-empty directory
# that is not already a git clone (its state after a fresh training run).
# Requires being logged in (notebook_login() or `huggingface-cli login`).
trainer.model.push_to_hub(hf_model_name)

In [None]:
# Define a function to combine images
def combine_images(images, hues=(120, 200, 360)):
    """Tint every mask with its own hue and stack the layers into one RGBA image.

    Args:
        images: PIL images (the segmentation masks), in stacking order. Each
            image provides the brightness of its tinted layer and is also used
            as that layer's alpha channel. The sequence itself is not modified.
        hues: hue value for each layer, reused cyclically when there are more
            images than hues.
            NOTE(review): Pillow's HSV hue band is 8-bit, so 360 lies outside
            0-255 — confirm these values give the intended colours.

    Returns:
        The RGBA composite, with the first image at the bottom.

    Raises:
        ValueError: if ``images`` or ``hues`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("combine_images needs at least one image")
    if not hues:
        raise ValueError("combine_images needs at least one hue")

    layers = []
    for position, mask in enumerate(images):
        hue = hues[position % len(hues)]
        hue_band, sat_band, val_band = mask.convert("HSV").split()
        hue_band = hue_band.point(lambda _, hue=hue: hue)  # actually changes the hue
        sat_band = sat_band.point(lambda _: 255)  # full saturation
        layer = Image.merge("HSV", (hue_band, sat_band, val_band)).convert("RGBA")
        layer.putalpha(mask)  # the mask itself decides where the layer is visible
        layers.append(layer)

    # Stack the layers bottom-up; works for any number of layers, not only three
    combined_image = layers[0]
    for layer in layers[1:]:
        combined_image = Image.alpha_composite(combined_image, layer)

    return combined_image

# Define a class for the pipeline and display; keeps both together so they can be saved and reused
class glacformer():
    """Viewer that overlays the predicted segmentation on each image of a list.

    Indexing an instance runs the segmentation pipeline on one image file and
    returns the blended overlay; iterating yields the overlay of every file.
    """

    def __init__(self, pipeline=None, image_list=None) -> None:
        """Store the pipeline and the image paths.

        Args:
            pipeline: callable that takes an image path and returns a list of
                dicts with a "mask" entry. When omitted, the image-segmentation
                pipeline for ``hf_model_name`` is created here (instead of once
                at class-definition time, as a default argument would).
            image_list: paths of the images to show. When omitted, every PNG
                matching "secondleg/*/cropped_images/*.png" is used.
        """
        if pipeline is None:
            # Imported under another name because the `pipeline` parameter
            # shadows the module-level factory of the same name.
            from transformers import pipeline as build_pipeline
            pipeline = build_pipeline("image-segmentation",
                                      model=hf_model_name, image_processor="nvidia/MiT-b0")
        if image_list is None:
            image_list = glob("secondleg/*/cropped_images/*.png")
        self.pipeline = pipeline
        self.image_list = image_list

    def __getitem__(self, index, alpha = 100):
        """Return the overlay for ``image_list[index]``.

        Args:
            index: position in ``image_list``.
            alpha: opacity (0-255) given to the segmentation layer; the
                original image receives ``255 - alpha``.
        """
        image_path = self.image_list[index]
        originals = [segment["mask"] for segment in self.pipeline(image_path)]
        segmap = combine_images(originals)
        # Was hard-coded to 100, which silently ignored the `alpha` argument
        segmap.putalpha(alpha)
        # Close the file handle as soon as the pixels have been converted
        with Image.open(image_path) as source:
            rgbaorig = source.convert("RGBA")
        rgbaorig.putalpha(255-alpha)
        return Image.alpha_composite(segmap,rgbaorig)

    def __len__(self):
        """Number of images available."""
        return len(self.image_list)

    def __iter__(self):
        """Yield the overlay of every image, in list order."""
        for i in range(len(self)):
            yield self[i]

    def display(self, display):
        """Show the overlays one at a time using the ``display`` callable.

        After each image the user is prompted; pressing enter shows the next
        one, any other input stops.
        """
        # Iterate over this instance, not over the global `evalmodel`
        for overlay in self:
            display(overlay)
            if input("press enter to continue, anything else to stop") != "":
                break

from IPython.display import display

# Build the viewer with its default pipeline and image list
evalmodel = glacformer()

# Call display on the instance. Calling it on the class bound the `display`
# function to `self` and raised a TypeError for the missing argument.
evalmodel.display(display)