In [2]:
# Fetch the assets used below from the Hugging Face dataset repo
# aoxo/photorealism-style-adapter-gta-v: two JSON mapping files and four
# zipped feature archives. Files are written to the current working directory.
!wget https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/cyberpunk-china-losangeles.json
!wget https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/gtav-mapillary.json
!wget https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/gtav_features.zip
!wget https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/cyberpunk_features.zip
!wget https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/realworld_features.zip
!wget https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/mapillary_features.zip

--2024-10-06 11:55:03--  https://huggingface.co/datasets/aoxo/photorealism-style-adapter-gta-v/resolve/main/cyberpunk-china-losangeles.json
Resolving huggingface.co (huggingface.co)... 13.35.7.38, 13.35.7.5, 13.35.7.81, ...
Connecting to huggingface.co (huggingface.co)|13.35.7.38|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/98/f1/98f153cc73597dd81851aa830af167335f564ee495a5b75514c395f89203aa08/9a58f0a5e8a2fc10305947ba00a91dcf29a951e47b28e6c57067c78f45795904?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27cyberpunk-china-losangeles.json%3B+filename%3D%22cyberpunk-china-losangeles.json%22%3B&response-content-type=application%2Fjson&Expires=1728474903&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyODQ3NDkwM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzk4L2YxLzk4ZjE1M2NjNzM1OTdkZDgxODUxYWE4MzBhZjE2NzMzNWY1NjRlZTQ5NWE1Yjc1NTE0YzM5NWY4OTIwM2FhMDgvOWE

In [159]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
!pip install einops
from einops import rearrange

# Patch embedding (strided Conv2d projection; no positional encoding is added here)
class DynamicPatchEmbedding(nn.Module):
    """Project a feature map into a sequence of patch tokens.

    A ``Conv2d`` whose kernel size and stride both equal ``patch_size`` maps
    every non-overlapping patch to one ``emb_dim``-dimensional token. No
    positional encoding is added by this module.

    Args:
        in_channels: Channel count of the incoming feature map.
        patch_size: Side length of each square patch.
        emb_dim: Dimensionality of each output token.
        img_size: Nominal input side length. It only populates
            ``num_patches``; ``forward`` does not enforce it.
    """

    def __init__(self, in_channels=2048, patch_size=8, emb_dim=768, img_size=256):
        super(DynamicPatchEmbedding, self).__init__()
        # Remembered so that flattened inputs are un-flattened with the same
        # channel count the projection was built for (previously a literal 2048).
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.proj = nn.Conv2d(in_channels, emb_dim, kernel_size=patch_size, stride=patch_size)
        self.num_patches = (img_size // patch_size) ** 2

    def forward(self, x):
        """Return patch tokens of shape ``(batch, num_tokens, emb_dim)``.

        Args:
            x: Either a 4D map ``(batch, in_channels, H, W)`` or a flattened
                2D tensor ``(batch, in_channels * S * S)`` holding a square map.

        Raises:
            ValueError: If a 2D input cannot be reshaped into a square
                ``in_channels``-channel map.
        """
        if x.dim() == 2:
            batch_size, flat_size = x.shape
            side = int(math.sqrt(flat_size // self.in_channels))  # Infer height == width
            if self.in_channels * side * side != flat_size:
                raise ValueError(
                    f"Cannot reshape flat input of size {flat_size} into a square "
                    f"feature map with {self.in_channels} channels"
                )
            x = x.view(batch_size, self.in_channels, side, side)

        x = self.proj(x)  # (batch_size, emb_dim, H/P, W/P)
        return x.flatten(2).transpose(1, 2)  # (batch_size, num_tokens, emb_dim)

# Style Adaptive Layer Normalization (SALN)
class StyleAdaptiveLayerNorm(nn.Module):
    """Layer normalisation whose output is rescaled and shifted by a style vector.

    A linear layer maps the style vector to ``2 * emb_dim`` values that are
    split into a scale and a shift, both applied to the layer-normalised input.

    Args:
        emb_dim: Size of the last dimension of both the tokens and the style vector.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.norm = nn.LayerNorm(emb_dim)
        self.fc = nn.Linear(emb_dim, emb_dim * 2)

    def forward(self, x, style):
        """Return ``scale * LayerNorm(x) + shift`` with scale/shift derived from ``style``."""
        # unsqueeze(1) inserts an axis at position 1 so the per-sample
        # modulation broadcasts against ``x`` along that axis.
        modulation = self.fc(style).unsqueeze(1)
        scale, shift = modulation.chunk(2, dim=-1)
        return scale * self.norm(x) + shift

# Cross Attention Layer
class CrossAttentionLayer(nn.Module):
    """Multi-head attention from ``x`` onto ``context``, followed by dropout.

    Args:
        emb_dim: Embedding size of queries, keys and values.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention output.
    """

    def __init__(self, emb_dim, num_heads, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=emb_dim, num_heads=num_heads, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context):
        """Attend with ``x`` as query and ``context`` as both key and value."""
        # The second return value (attention weights) is discarded.
        attended = self.attn(query=x, key=context, value=context)[0]
        return self.dropout(attended)

# Transformer Encoder Block with SALN
class TransformerEncoderBlock(nn.Module):
    """Post-norm encoder block: self-attention and feed-forward, each followed by SALN.

    Args:
        emb_dim: Token embedding size.
        num_heads: Number of attention heads.
        hidden_dim: Width of the feed-forward hidden layer.
        dropout: Dropout probability used in the attention layer and on both residual branches.
    """

    def __init__(self, emb_dim=768, num_heads=8, hidden_dim=2048, dropout=0.1):
        super().__init__()
        self.attn = CrossAttentionLayer(emb_dim, num_heads, dropout=dropout)
        feed_forward = [
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, emb_dim),
        ]
        self.ff = nn.Sequential(*feed_forward)
        self.norm1 = StyleAdaptiveLayerNorm(emb_dim)
        self.norm2 = StyleAdaptiveLayerNorm(emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, style):
        """Run one encoder step on tokens ``x`` conditioned on ``style``."""
        # Self-attention: the tokens serve as both query and context.
        # NOTE(review): CrossAttentionLayer already applies dropout to its
        # output, so this branch is dropped out twice - confirm this is intended.
        x = self.norm1(x + self.dropout(self.attn(x, x)), style)

        # Position-wise feed-forward branch with its own residual + SALN.
        x = self.norm2(x + self.dropout(self.ff(x)), style)
        return x

# Transformer Decoder Block with SALN
class TransformerDecoderBlock(nn.Module):
    """Post-norm decoder block: self-attention, attention over ``enc_output``, feed-forward.

    Each of the three residual branches is followed by its own
    style-adaptive layer norm.

    Args:
        emb_dim: Token embedding size.
        num_heads: Number of attention heads.
        hidden_dim: Width of the feed-forward hidden layer.
        dropout: Dropout probability handed to both attention layers.
    """

    def __init__(self, emb_dim=768, num_heads=8, hidden_dim=2048, dropout=0.1):
        super().__init__()
        self.attn1 = CrossAttentionLayer(emb_dim, num_heads, dropout=dropout)
        self.attn2 = CrossAttentionLayer(emb_dim, num_heads, dropout=dropout)
        feed_forward = [
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, emb_dim),
        ]
        self.ff = nn.Sequential(*feed_forward)
        self.norm1 = StyleAdaptiveLayerNorm(emb_dim)
        self.norm2 = StyleAdaptiveLayerNorm(emb_dim)
        self.norm3 = StyleAdaptiveLayerNorm(emb_dim)

    def forward(self, x, enc_output, style):
        """Run one decoder step on ``x`` given ``enc_output`` and the ``style`` vector."""
        # 1) self-attention over the decoder tokens
        x = self.norm1(x + self.attn1(x, x), style)
        # 2) attention with the decoder tokens as query and enc_output as context
        x = self.norm2(x + self.attn2(x, enc_output), style)
        # 3) position-wise feed-forward
        x = self.norm3(x + self.ff(x), style)
        return x

# Swin Transformer Block
class SwinTransformerBlock(nn.Module):
    """Pre-norm transformer block (attention + GELU MLP) with residual connections.

    NOTE(review): ``window_size`` and ``shift_size`` are accepted but never
    used; attention here runs over all tokens at once rather than over
    shifted local windows.

    Args:
        dim: Token embedding size.
        num_heads: Number of attention heads.
        window_size: Unused.
        shift_size: Unused.
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=2):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, batch_first=True)
        expanded = 4 * dim
        self.mlp = nn.Sequential(
            nn.Linear(dim, expanded),
            nn.GELU(),
            nn.Linear(expanded, dim),
        )
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        """Apply normalised self-attention, then a normalised MLP, each added back to its input."""
        normed = self.norm1(x)
        x = x + self.attn(normed, normed, normed)[0]  # [0]: drop attention weights
        return x + self.mlp(self.norm2(x))

# Refinement Block
class RefinementBlock(nn.Module):
    """Convolution followed by batch normalisation and an in-place ReLU.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution zero padding.
    """

    def __init__(self, in_channels=768, out_channels=3, kernel_size=3, stride=1, padding=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(bn(conv(x)))``."""
        return self.relu(self.bn(self.conv(x)))

# Main ViT Image-to-Image Model with SALN
class RealFormerv3(nn.Module):
    """Feature-to-feature transformer conditioned on a style feature map.

    Pipeline: patch embedding -> SALN encoder stack -> SALN decoder stack ->
    plain transformer ("swin") stack -> conv refinement to 3 channels ->
    1x1 conv back to ``FEATURE_CHANNELS`` channels.

    Args:
        img_size: Nominal input side length forwarded to the patch embedding.
        patch_size: Patch side length of the embedding convolution.
        emb_dim: Token embedding size (must be divisible by ``num_heads``).
        num_heads: Attention heads in every attention layer.
        num_layers: Number of blocks in each of the three stacks.
        hidden_dim: Feed-forward width of encoder/decoder blocks.
        window_size: Forwarded to ``SwinTransformerBlock``.
    """

    # Channel count of the content/style feature maps and of the model output.
    FEATURE_CHANNELS = 2048

    def __init__(self, img_size=512, patch_size=8, emb_dim=768, num_heads=12, num_layers=12, hidden_dim=3072, window_size=8):
        super(RealFormerv3, self).__init__()
        channels = self.FEATURE_CHANNELS
        self.patch_embed = DynamicPatchEmbedding(in_channels=channels, patch_size=patch_size, emb_dim=emb_dim, img_size=img_size)

        self.encoder_layers = nn.ModuleList([TransformerEncoderBlock(emb_dim, num_heads, hidden_dim) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([TransformerDecoderBlock(emb_dim, num_heads, hidden_dim) for _ in range(num_layers)])
        self.swin_layers = nn.ModuleList([SwinTransformerBlock(emb_dim, num_heads, window_size) for _ in range(num_layers)])

        self.refinement = RefinementBlock(in_channels=emb_dim, out_channels=3)
        self.final_layer = nn.Conv2d(3, channels, kernel_size=1)  # Maps the 3 refined channels back to feature width

        # Style encoder: feature map -> one emb_dim vector per sample
        self.style_encoder = nn.Sequential(
            nn.Conv2d(channels, emb_dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(emb_dim, emb_dim)
        )

    def forward(self, content, style):
        """Transform ``content`` features under the guidance of ``style`` features.

        Args:
            content: ``(batch, 2048, H, W)`` feature map, or a flattened 2D
                tensor holding a square map.
            style: Style feature map in either of the same two layouts.

        Returns:
            Tensor of shape ``(batch, 2048, H // patch_size, W // patch_size)``.
        """
        # Patch embedding turns the content map into a token sequence
        x = self.patch_embed(content)

        # Reshape style to 4D if it arrived flattened
        if len(style.shape) == 2:
            batch_size = style.shape[0]
            channels = self.FEATURE_CHANNELS
            h = w = int(math.sqrt(style.shape[1] // channels))  # Infer height and width
            style = style.view(batch_size, channels, h, w)

        style_features = self.style_encoder(style)

        # Transformer encoder with SALN
        for encoder in self.encoder_layers:
            x = encoder(x, style_features)

        # Transformer decoder with SALN
        for decoder in self.decoder_layers:
            x = decoder(x, x, style_features)  # Using self-attention for now

        # Plain transformer ("swin") processing
        for swin in self.swin_layers:
            x = swin(x)

        # Restore the 2D token grid before the convolutional head.
        batch_size, num_patches, emb_dim = x.shape
        if content.dim() == 4:
            # The embedding conv has kernel == stride == patch_size and no
            # padding, so its output grid is (H // P, W // P). Using it
            # directly keeps non-square inputs working.
            patch_size = self.patch_embed.patch_size
            h, w = content.shape[2] // patch_size, content.shape[3] // patch_size
        else:
            # Flattened content is un-flattened as a square map upstream.
            h = w = int(math.sqrt(num_patches))
        x = x.transpose(1, 2).reshape(batch_size, emb_dim, h, w)

        # Final refinement and output layer
        x = self.refinement(x)
        x = self.final_layer(x)
        return x

# Loss functions remain the same
def total_variation_loss(x):
    """Sum of absolute differences between neighbours along dims 2 and 3 of a 4D tensor."""
    along_dim2 = (x[:, :, :-1, :] - x[:, :, 1:, :]).abs().sum()
    along_dim3 = (x[:, :, :, :-1] - x[:, :, :, 1:]).abs().sum()
    return along_dim2 + along_dim3

def combined_loss(output, target):
    """Mean L1 distance to ``target`` plus 1e-4 times the total variation of ``output``."""
    reconstruction = nn.L1Loss()(output, target)
    smoothness = total_variation_loss(output)
    return reconstruction + 0.0001 * smoothness

def psnr(img1, img2):
    """Peak signal-to-noise ratio in dB, with the peak value fixed at 1.0 in the formula.

    Returns the Python float ``inf`` when the two inputs are identical,
    otherwise a 0-dim tensor.
    """
    squared_error = (img1 - img2) ** 2
    mse = squared_error.mean()
    if mse == 0:
        return float('inf')
    rmse = torch.sqrt(mse)
    return 20 * torch.log10(1.0 / rmse)



In [154]:
import json
import os

from torch.utils.data import Dataset, DataLoader

class FeatureMapDataset(Dataset):
    """Pairs each frame feature file with the real-image feature files listed for it.

    ``json_file`` must contain a mapping ``{frame_filename: [[real_filename,
    similarity], ...]}``; the first entry of each list is treated as the top match.

    Args:
        frames_dir: Directory holding the frame feature files.
        real_dir: Directory holding the real-image feature files.
        json_file: Path to the JSON mapping described above.
    """

    def __init__(self, frames_dir, real_dir, json_file):
        self.frames_dir = frames_dir
        self.real_dir = real_dir

        with open(json_file, 'r') as f:
            self.mappings = json.load(f)

        self.frame_files = list(self.mappings.keys())  # List of frame filenames

    def __len__(self):
        return len(self.frame_files)

    def _load(self, directory, filename):
        """Load one saved feature object onto the CPU."""
        # map_location='cpu' keeps tensors that were saved from a GPU off CUDA
        # inside DataLoader worker processes; the training loop moves batches
        # to the target device itself.
        # NOTE(review): torch.load unpickles the file - only use trusted feature files.
        return torch.load(os.path.join(directory, filename), map_location='cpu')

    def __getitem__(self, idx):
        """Return ``(frame_feature, top_real_feature, top_similarity, other_real_features)``.

        Raises:
            ValueError: If the mapping lists no real images for the frame.
        """
        frame_file = self.frame_files[idx]
        real_images = self.mappings[frame_file]
        if not real_images:
            # A bare IndexError here would be mistaken for end-of-sequence by
            # Python's legacy __getitem__ iteration protocol.
            raise ValueError(f"No real-image matches listed for frame {frame_file!r}")

        # Load frame feature map
        frame_feature = self._load(self.frames_dir, frame_file)

        # Load every listed real-image feature map (entry layout: [filename, similarity])
        real_features = [self._load(self.real_dir, entry[0]) for entry in real_images]

        # First entry is the top match; the remaining ones are returned as a list
        top_real_feature = real_features[0]
        top_similarity = real_images[0][1]

        return frame_feature, top_real_feature, top_similarity, real_features[1:]

# Define data loaders
frames_dir = '/kaggle/working/frames_features/'
real_dir = '/kaggle/working/real_features/'
# open() does not expand wildcards, so one mapping file has to be named
# explicitly. The file names match the JSON files fetched by the wget cell;
# this assumes they were downloaded into /kaggle/working - adjust if not.
json_file = '/kaggle/working/gtav-mapillary.json'  # or cyberpunk-china-losangeles.json

dataset = FeatureMapDataset(frames_dir, real_dir, json_file)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

In [151]:
import torch
# Ask PyTorch to release GPU memory that its allocator has cached but is not using.
torch.cuda.empty_cache()

In [None]:
import os

def setup_distributed(local_rank=None):
    """Initialise the NCCL process group and bind this process to its GPU.

    Args:
        local_rank: GPU index for this process. When omitted it is read from
            the ``LOCAL_RANK`` environment variable (defaulting to 0).

    Returns:
        The local rank that was selected.
    """
    # Local imports: the original body referenced `dist` and `args`, neither of
    # which is defined anywhere in this notebook.
    import os
    import torch.distributed as dist

    if local_rank is None:
        local_rank = int(os.environ.get("LOCAL_RANK", 0))
    if not dist.is_initialized():
        dist.init_process_group(backend='nccl')
    torch.cuda.set_device(local_rank)
    return local_rank

def contrastive_loss(anchor, positive, negatives, margin=0.2):
    """Margin-based contrastive loss on cosine similarity along the last dimension.

    For every negative the hinge term ``max(0, margin + sim(anchor, negative)
    - sim(anchor, positive))`` is computed; the terms are summed over the
    negatives and then averaged over all remaining elements.

    Args:
        anchor: Tensor compared against the positive and every negative.
        positive: Tensor the anchor should be similar to.
        negatives: Iterable of tensors the anchor should be dissimilar to. May be empty.
        margin: Required similarity gap between positive and negatives.

    Returns:
        A 0-dim tensor. With no negatives the result is zero but stays
        attached to the autograd graph, so ``backward()`` still works.
    """
    negatives = list(negatives)

    # Cosine similarity between anchor and positive
    pos_sim = F.cosine_similarity(anchor, positive, dim=-1)

    if not negatives:
        # Previously `loss` stayed the Python float 0.0 here and `.mean()` raised.
        return (pos_sim * 0.0).mean()

    hinge_total = sum(
        torch.clamp(margin + F.cosine_similarity(anchor, neg, dim=-1) - pos_sim, min=0.0)
        for neg in negatives
    )
    return hinge_total.mean()

# Training script
def train_contrastive(model, dataloader, optimizer, num_epochs=10, margin=0.2):
    """Train ``model`` with the contrastive loss and checkpoint the best epoch.

    Each batch from ``dataloader`` must unpack to ``(frame_feature,
    top_real_feature, top_similarity, other_real_features)``. The model is
    called as ``model(frame_feature, top_real_feature)``; the top real
    feature is the positive and the other real features are the negatives.

    Args:
        model: Network to train; moved to the GPU when one is available.
        dataloader: Iterable of batches with the layout above; must support ``len()``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``dataloader``.
        margin: Margin forwarded to ``contrastive_loss``.

    Raises:
        ValueError: If ``dataloader`` has no batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Wrap in DataParallel only when several GPUs really exist; hard-coding
    # device_ids=[0, 1] fails on CPU-only and single-GPU machines.
    gpu_count = torch.cuda.device_count() if device.type == "cuda" else 0
    if gpu_count > 1:
        model = nn.DataParallel(model, device_ids=list(range(gpu_count)))
    model.to(device)

    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader has no batches; nothing to train on")

    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_psnr = 0.0

        for batch_idx, (frame_feature, top_real_feature, _top_similarity, other_real_features) in enumerate(dataloader):
            frame_feature = frame_feature.to(device)
            top_real_feature = top_real_feature.to(device)
            other_real_features = [neg.to(device) for neg in other_real_features]

            optimizer.zero_grad()

            # Forward pass
            output = model(frame_feature, top_real_feature)

            # Compute contrastive loss
            loss = contrastive_loss(output, top_real_feature, other_real_features, margin=margin)

            # Backpropagation and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # PSNR is a metric only: compute it without autograd and store a
            # plain float so the running sum does not keep graphs alive.
            with torch.no_grad():
                psnr_value = float(psnr(output, top_real_feature))
            running_psnr += psnr_value

            # Print training status
            if batch_idx % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{num_batches}], Loss: {loss.item()}, PSNR: {psnr_value:.4f}")

        # Epoch-level metrics
        epoch_loss = running_loss / num_batches
        avg_psnr = running_psnr / num_batches

        print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {epoch_loss:.4f}, Avg PSNR: {avg_psnr:.4f}")

        # Save the best model
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            # Save the unwrapped module so checkpoint keys carry no "module."
            # prefix, and use a real ".pth" extension (was 'realformerv3,pth').
            to_save = model.module if isinstance(model, nn.DataParallel) else model
            torch.save(to_save.state_dict(), 'realformerv3.pth')
            print(f"Model saved at epoch {epoch+1} with loss {best_loss:.4f}")

# Model and optimizer setup
# nn.MultiheadAttention requires emb_dim to be divisible by num_heads;
# 768 % 42 != 0, so the previous num_heads=42 could not be constructed.
# 12 is the RealFormerv3 default and gives 64-dimensional heads.
# NOTE(review): the model printout recorded further down shows 8 blocks per
# stack, not 16 - confirm which num_layers the published weights used.
model = RealFormerv3(img_size=256, patch_size=1, emb_dim=768, num_heads=12, num_layers=16, hidden_dim=3072)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Start training
train_contrastive(model, dataloader, optimizer, num_epochs=50, margin=0.2)

In [174]:
import os
from huggingface_hub import login, HfApi

# Login to Hugging Face Hub
# The token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When it is unset, token=None makes
# login() ask for it interactively.
login(token=os.environ.get("HF_TOKEN") or None)

# Initialize the Hugging Face API
api = HfApi()

# Specify the directory containing the models
model_directory = "/kaggle/working/"
repo_id = "aoxo/RealFormer"
repo_type = "model"

# Loop through all files in the model directory
for filename in os.listdir(model_directory):
    # Only upload files that end with .pth
    if not filename.endswith(".pth"):
        continue

    file_path = os.path.join(model_directory, filename)

    # Upload the model file to the repository under the same filename
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=filename,
        repo_id=repo_id,
        repo_type=repo_type,
    )
    print(f"Uploaded {filename} to {repo_id} repository.")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Uploaded realformerv3.pth to aoxo/RealFormer repository.


realformerv3_fp16.pth:   0%|          | 0.00/505M [00:00<?, ?B/s]

Uploaded realformerv3_fp16.pth to aoxo/RealFormer repository.


realformerv3_bf16.pth:   0%|          | 0.00/505M [00:00<?, ?B/s]

Uploaded realformerv3_bf16.pth to aoxo/RealFormer repository.


realformerv3_int8.pth:   0%|          | 0.00/344M [00:00<?, ?B/s]

Uploaded realformerv3_int8.pth to aoxo/RealFormer repository.


In [177]:
import copy

import torch.quantization as quantization

total_params = sum(p.numel() for p in model.parameters())
print(total_params)
print(model)

# Full-precision checkpoint
torch.save(model.state_dict(), 'realformerv3.pth')

# Every reduced-precision variant is derived from a fresh copy of the
# original model. Casting `model` itself in place (half() followed by
# to(bfloat16)) would build the BF16 weights from already-rounded FP16
# values and leave the live model in BF16 afterwards.

# FP16 checkpoint
model_fp16 = copy.deepcopy(model).half()
torch.save(model_fp16.state_dict(), 'realformerv3_fp16.pth')
del model_fp16

# BF16 checkpoint
model_bf16 = copy.deepcopy(model).to(torch.bfloat16)
torch.save(model_bf16.state_dict(), 'realformerv3_bf16.pth')
del model_bf16

# Dynamic (not static) INT8 quantization of the Linear layers. It is applied
# to a float32 CPU copy; inplace=True avoids a second internal deep copy.
model_int8 = quantization.quantize_dynamic(
    copy.deepcopy(model).cpu().float(), {torch.nn.Linear}, dtype=torch.qint8, inplace=True
)

# Save the INT8 quantized model
torch.save(model_int8.state_dict(), 'realformerv3_int8.pth')

252617225
RealFormerv3(
  (patch_embed): DynamicPatchEmbedding(
    (proj): Conv2d(2048, 768, kernel_size=(1, 1), stride=(1, 1))
  )
  (encoder_layers): ModuleList(
    (0-7): 8 x TransformerEncoderBlock(
      (attn): CrossAttentionLayer(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): Sequential(
        (0): Linear(in_features=768, out_features=3072, bias=True)
        (1): ReLU()
        (2): Linear(in_features=3072, out_features=768, bias=True)
      )
      (norm1): StyleAdaptiveLayerNorm(
        (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (fc): Linear(in_features=768, out_features=1536, bias=True)
      )
      (norm2): StyleAdaptiveLayerNorm(
        (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (fc): Linear(in_features=768, out_features=1536, bias=True)
      )
 