2025_march16
- app.py +128 -0
- images/df1.jpg +0 -0
- images/df2.jpg +0 -0
- images/df3.jpg +0 -0
- images/df4.jpg +0 -0
- modules/.ipynb_checkpoints/denormalize-checkpoint.py +21 -0
- modules/.ipynb_checkpoints/finetunedvqgan-checkpoint.py +31 -0
- modules/.ipynb_checkpoints/frameworkeval-checkpoint.py +56 -0
- modules/.ipynb_checkpoints/modelz-checkpoint.py +155 -0
- modules/.ipynb_checkpoints/segmentface-checkpoint.py +75 -0
- modules/denormalize.py +21 -0
- modules/finetunedvqgan.py +31 -0
- modules/frameworkeval.py +56 -0
- modules/modelz.py +155 -0
- modules/segmentface.py +75 -0
app.py
ADDED
@@ -0,0 +1,128 @@
import io
from io import BytesIO
import os
import shutil
import requests
import numpy as np
import cv2
from PIL import Image, ImageOps
import math
import matplotlib.pyplot as plt
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torch.utils.checkpoint import checkpoint
from torchvision.models import vgg16
from torchmetrics.image.fid import FrechetInceptionDistance
from torchmetrics.functional import structural_similarity_index_measure
from facenet_pytorch import InceptionResnetV1
from taming.models.vqgan import VQModel
from omegaconf import OmegaConf
from taming.models.vqgan import GumbelVQ
import gradio as gr
from gradio_client import Client, file  # `file` wraps files for a remote Space call
from modules.finetunedvqgan import Generator
from modules.modelz import DeepfakeToSourceTransformer
from modules.frameworkeval import DF
from modules.segmentface import FaceSegmenter
from modules.denormalize import denormalize_bin, denormalize_tr, denormalize_ar

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##________________________Transformation______________________________

transform = T.Compose([
    T.Resize((256, 256)),
    T.ToTensor(),
    T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])  # Normalize to [-1, 1]

#_________________Define:Gradio Function________________________

def gen_sources(deepfake_img):
    ##----------------------Initialize:Face Segmentation------------------------------
    segmenter = FaceSegmenter(threshold=0.5)
    #----------------DeepFake Face Segmentation-----------------
    deepfake_seg = segmenter.segment_face(deepfake_img)  # ndarray (BGR on the OpenCV path)
    config_path = "./models/config.yaml"
    #------------Initialize:Decoder-F------------------------
    checkpoint_path_f = "./models/model_vaq1_ff.pth"
    checkpoint_f = torch.load(checkpoint_path_f, map_location=device)
    model_vaq_f = Generator(config_path, device).load_models()
    model_vaq_f.load_state_dict(checkpoint_f, strict=True)
    model_vaq_f.eval()
    #------------Initialize:Decoder-G------------------------
    checkpoint_path_g = "./models/model_vaq2_gg.pth"
    checkpoint_g = torch.load(checkpoint_path_g, map_location=device)
    model_vaq_g = Generator(config_path, device).load_models()
    model_vaq_g.load_state_dict(checkpoint_g, strict=True)
    model_vaq_g.eval()
    ##------------------------Initialize Model-F-------------------------------------
    model_z1 = DeepfakeToSourceTransformer().to(device)
    model_z1.load_state_dict(torch.load("./models/model_z1_ff.pth", map_location=device), strict=True)
    model_z1.eval()
    ##------------------------Initialize Model-G-------------------------------------
    model_z2 = DeepfakeToSourceTransformer().to(device)
    model_z2.load_state_dict(torch.load("./models/model_z2_gg.pth", map_location=device), strict=True)
    model_z2.eval()
    ##--------------------Initialize:Evaluation---------------------------------------
    criterion = DF()

    ##----------------------Operation-------------------------------------------------
    with torch.no_grad():
        # Load and preprocess input image
        img = Image.open(deepfake_img).convert('RGB')
        segimg = Image.fromarray(cv2.cvtColor(deepfake_seg, cv2.COLOR_BGR2RGB)).convert('RGB')
        df_img = transform(img).unsqueeze(0).to(device)  # Shape: (1, 3, 256, 256)
        seg_img = transform(segimg).unsqueeze(0).to(device)

        # Calculate quantized_block for all images
        z_df, _, _ = model_vaq_f.encode(df_img)
        z_seg, _, _ = model_vaq_g.encode(seg_img)
        rec_z_img1 = model_z1(z_df)
        rec_z_img2 = model_z2(z_seg)
        rec_img1 = model_vaq_f.decode(rec_z_img1)
        rec_img2 = model_vaq_g.decode(rec_z_img2)
        rec_img1 = denormalize_bin(rec_img1.squeeze(0))  # [-1, 1] -> [0, 1] for ToPILImage
        rec_img2 = denormalize_bin(rec_img2.squeeze(0))
        rec_img1_pil = T.ToPILImage()(rec_img1)
        rec_img2_pil = T.ToPILImage()(rec_img2)

        # Save PIL images to in-memory buffers
        buffer1 = BytesIO()
        buffer2 = BytesIO()
        rec_img1_pil.save(buffer1, format="PNG")
        rec_img2_pil.save(buffer2, format="PNG")

        # Pass buffers to the Gradio client (NOTE: `client` is never initialized in this file
        # and the face-swap Space it should point to is not specified in this commit;
        # gradio_client's `file` expects a path/URL, so these buffers may need to be written
        # to temporary files first)
        result = client.predict(
            target=file(buffer1),
            source=file(buffer2), slider=100, adv_slider=100,
            settings=["Adversarial Defense"], api_name="/run_inference"
        )

        # Load result and compute loss
        dfimage_pil = Image.open(result)  # the client returns a path to the swapped image
        buffer3 = BytesIO()
        dfimage_pil.save(buffer3, format="PNG")
        buffer3.seek(0)
        rec_df = transform(Image.open(buffer3)).unsqueeze(0).to(device)
        rec_loss, _ = criterion(df_img, rec_df)

    return (rec_img1_pil, rec_img2_pil, dfimage_pil, round(rec_loss.item(), 3))

#________________________Create the Gradio interface_________________________________
interface = gr.Interface(
    fn=gen_sources,
    inputs=gr.Image(type="filepath", label="Input Image"),  # the function expects a file path
    outputs=[
        gr.Image(type="pil", label="Recovered Source Image 1 (Target Image)"),
        gr.Image(type="pil", label="Recovered Source Image 2 (Source Image)"),
        gr.Image(type="pil", label="Reconstructed Deepfake Image"),
        gr.Number(label="Reconstruction Loss")
    ],
    examples=["./images/df1.jpg", "./images/df2.jpg", "./images/df3.jpg", "./images/df4.jpg"],
    theme=gr.themes.Soft(),
    title="Uncovering Deepfake Images for Identifying Source Images",
    description="Upload a deepfake image.",
)

interface.launch(debug=True)
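Once the app is running, it can also be queried programmatically. The sketch below is illustrative only and not part of the commit: the local URL and the "/predict" endpoint are assumptions (they are the defaults printed by interface.launch() and used by a single-function gr.Interface), and the three image outputs come back as file paths.

from gradio_client import Client, file

client = Client("http://127.0.0.1:7860")        # local URL printed by interface.launch()
outputs = client.predict(
    file("./images/df1.jpg"),                   # the single image input
    api_name="/predict"                         # default endpoint for a gr.Interface
)
rec_src1, rec_src2, rec_df, rec_loss = outputs  # three image paths plus the loss value
print(rec_loss)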
images/df1.jpg
ADDED
images/df2.jpg
ADDED
images/df3.jpg
ADDED
images/df4.jpg
ADDED
modules/.ipynb_checkpoints/denormalize-checkpoint.py
ADDED
@@ -0,0 +1,21 @@
(contents identical to modules/denormalize.py, shown below)
modules/.ipynb_checkpoints/finetunedvqgan-checkpoint.py
ADDED
@@ -0,0 +1,31 @@
(contents identical to modules/finetunedvqgan.py, shown below)
modules/.ipynb_checkpoints/frameworkeval-checkpoint.py
ADDED
@@ -0,0 +1,56 @@
(contents identical to modules/frameworkeval.py, shown below)
modules/.ipynb_checkpoints/modelz-checkpoint.py
ADDED
@@ -0,0 +1,155 @@
(contents identical to modules/modelz.py, shown below)
modules/.ipynb_checkpoints/segmentface-checkpoint.py
ADDED
@@ -0,0 +1,75 @@
(contents identical to modules/segmentface.py, shown below)
modules/denormalize.py
ADDED
@@ -0,0 +1,21 @@
import numpy as np
import torch
import math

#------------------Denormalization---------------------------------------------
def denormalize_bin(tensor):
    tr = torch.clamp(tensor, -1., 1.)  # Clamp the values between -1 and 1
    tr = tr.add(1).div(2)              # Shift to [0, 1]
    return tr

def denormalize_tr(tensor):
    tr = torch.clamp(tensor, -1., 1.)  # Clamp the values between -1 and 1
    tr = tr.add(1).div(2).mul(255)     # Shift to [0, 1] and scale to [0, 255]
    tr = tr.byte()                     # Convert the tensor to uint8
    return tr

def denormalize_ar(tensor):
    tr = torch.clamp(tensor, -1., 1.)  # Clamp the values between -1 and 1
    tr = tr.add(1).div(2).mul(255)     # Shift to [0, 1] and scale to [0, 255]
    tr = tr.byte()                     # Convert the tensor to uint8
    arr = tr.permute(0, 2, 3, 1).cpu().detach().numpy()  # Convert to (N, H, W, C) numpy array
    return arr
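A minimal usage sketch for these helpers (illustrative, not part of the commit): it assumes a batch of images normalized to [-1, 1], as produced by the T.Normalize(mean=0.5, std=0.5) transform in app.py.

import torch
from modules.denormalize import denormalize_bin, denormalize_ar

batch = torch.rand(2, 3, 256, 256) * 2 - 1   # stand-in batch in [-1, 1]
unit = denormalize_bin(batch)                # float tensor in [0, 1], same shape
arrays = denormalize_ar(batch)               # uint8 numpy array, shape (2, 256, 256, 3)
print(unit.min().item(), unit.max().item(), arrays.shape, arrays.dtype)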
modules/finetunedvqgan.py
ADDED
@@ -0,0 +1,31 @@
import torch
from torch.utils.checkpoint import checkpoint
from taming.models.vqgan import VQModel
from omegaconf import OmegaConf
from taming.models.vqgan import GumbelVQ

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Generator:
    def __init__(self, config_path, device=device):
        self.config_path = config_path
        self.device = device

    def load_models(self):
        # Load configuration
        config = OmegaConf.load(self.config_path)
        # Extract parameters specific to GumbelVQ
        vq_params = config.model.params
        # Initialize the GumbelVQ model
        model_vaq = GumbelVQ(
            ddconfig=vq_params.ddconfig,
            lossconfig=vq_params.lossconfig,
            n_embed=vq_params.n_embed,
            embed_dim=vq_params.embed_dim,
            kl_weight=vq_params.kl_weight,
            temperature_scheduler_config=vq_params.temperature_scheduler_config,
        ).to(self.device)

        return model_vaq
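A hedged sketch of how this wrapper is consumed: the config and checkpoint paths are the ones referenced in app.py (assumptions here), and the checkpoint is assumed to be a plain GumbelVQ state dict.

import torch
from modules.finetunedvqgan import Generator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_vaq_f = Generator("./models/config.yaml", device).load_models()  # build GumbelVQ from config
state = torch.load("./models/model_vaq1_ff.pth", map_location=device)  # fine-tuned weights
model_vaq_f.load_state_dict(state, strict=True)
model_vaq_f.eval()                                                     # encode()/decode() then usable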
modules/frameworkeval.py
ADDED
@@ -0,0 +1,56 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import vgg16
from torchmetrics.functional import structural_similarity_index_measure
from facenet_pytorch import InceptionResnetV1
from modules.denormalize import denormalize_bin, denormalize_tr, denormalize_ar  # package path, matching app.py's imports

class DF(nn.Module):
    def __init__(self):
        super(DF, self).__init__()
        self.mse_weight = 0.25
        self.perceptual_weight = 0.25
        self.ssim_weight = 0.25
        self.idsim_weight = 0.25
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vgg = vgg16(pretrained=True).features[:16].to(self.device).eval()
        self.facenet = InceptionResnetV1(pretrained='vggface2').to(self.device).eval()
        for param in self.facenet.parameters():
            param.requires_grad = False  # Freeze the model
        self.cosloss = nn.CosineEmbeddingLoss()

    def perceptual_loss(self, real, fake):
        with torch.no_grad():  # VGG is frozen during training
            real_features = self.vgg(real)
            fake_features = self.vgg(fake)
        return F.mse_loss(real_features, fake_features)

    def idsimilarity(self, real, fake):
        with torch.no_grad():
            # Extract embeddings
            input_embed = self.facenet(real).to(self.device)
            generated_embed = self.facenet(fake).to(self.device)
        # Compute cosine similarity loss
        target = torch.ones(input_embed.size(0)).to(real.device)  # Target = 1 (maximize similarity)
        return self.cosloss(input_embed, generated_embed, target)

    def forward(self, r, f):
        real = denormalize_bin(r)  # [-1, 1] to [0, 1]
        fake = denormalize_bin(f)
        mse_loss = F.mse_loss(real, fake)
        perceptual_loss = self.perceptual_loss(real, fake)
        idsim_loss = self.idsimilarity(real, fake)
        ssim = structural_similarity_index_measure(fake, real)
        ssim_loss = 1 - ssim
        id_si = 1 - idsim_loss  # identity similarity (not used in the total below)

        total_loss = (self.mse_weight * mse_loss) + (self.perceptual_weight * perceptual_loss) \
                     + (self.idsim_weight * idsim_loss) + (self.ssim_weight * ssim_loss)
        components = {
            "MSE Loss": mse_loss.item(),
            "Perceptual Loss": perceptual_loss.item(),
            "ID-SIM Loss": idsim_loss.item(),
            "SSIM Loss": ssim_loss.item()
        }

        return total_loss, components
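A minimal, illustrative way to exercise this loss (not from the commit): both inputs are expected in [-1, 1] at 256x256, and instantiating DF downloads the VGG16 and FaceNet weights on first use.

import torch
from modules.frameworkeval import DF

criterion = DF()
real = (torch.rand(1, 3, 256, 256) * 2 - 1).to(criterion.device)  # stand-in for the deepfake input
fake = (torch.rand(1, 3, 256, 256) * 2 - 1).to(criterion.device)  # stand-in for the reconstruction
loss, parts = criterion(real, fake)
print(round(loss.item(), 3), parts)  # weighted total plus per-term breakdown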
modules/modelz.py
ADDED
@@ -0,0 +1,155 @@
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

##_____________________Define:MODEL-F & MODEL-G_________________

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1024):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(0.1)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

# Transformer Encoder
class TransformerEncoder(nn.Module):
    def __init__(self, d_model=256, nhead=8, num_layers=6, dim_feedforward=1024, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
                                       dropout=dropout, batch_first=True),
            num_layers=num_layers
        )

    def preprocess_latent(self, Z):
        batch_size, channels, height, width = Z.shape  # (batch_size, 256, 32, 32)
        seq_len = height * width
        Z = Z.permute(0, 2, 3, 1).reshape(batch_size, seq_len, channels)  # (batch_size, 1024, 256)
        return Z

    def postprocess_latent(self, Z):
        batch_size, seq_len, channels = Z.shape  # (batch_size, 1024, 256)
        height = width = int(math.sqrt(seq_len))
        Z = Z.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)  # (batch_size, 256, 32, 32)
        return Z

    def forward(self, Z):
        Z = self.preprocess_latent(Z)
        Z = self.positional_encoding(Z)
        Z = self.encoder(Z)
        Z = self.postprocess_latent(Z)
        return Z  # latent of transformer

class TransformerDecoder(nn.Module):
    def __init__(self, d_model=256, nhead=8, num_layers=12, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.d_model = d_model

        # Enhanced positional encoding
        self.positional_encoding = PositionalEncoding(d_model)

        # Multi-layer learnable start tokens
        self.base_start = nn.Parameter(torch.randn(1, 1024, d_model))
        self.start_net = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, dim_feedforward),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model),
            nn.LayerNorm(d_model)
        )

        # Context-aware transformer decoder
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True
            ),
            num_layers=num_layers
        )

        # Output projection with residual
        self.output_layer = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.GELU(),
            nn.Linear(d_model * 2, d_model))

        self.init_weights()

    def init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        nn.init.normal_(self.base_start, mean=0, std=0.02)

    def preprocess_latent(self, Z):
        # Convert (B, C, H, W) to (B, H*W, C)
        return Z.permute(0, 2, 3, 1).flatten(1, 2)

    def postprocess_latent(self, Z):
        # Convert (B, H*W, C) back to (B, C, H, W)
        B, L, C = Z.shape
        H = W = int(L ** 0.5)
        return Z.view(B, H, W, C).permute(0, 3, 1, 2)

    def forward(self, Z, Z1_start_tokens=None, teacher_forcing_ratio=0.5):
        # Process input latent
        Z = self.preprocess_latent(Z)
        # Z = self.positional_encoding(Z)

        # Generate enhanced start tokens
        B = Z.size(0)
        base_tokens = self.base_start.expand(B, -1, -1)
        processed_start = self.start_net(base_tokens)

        # Teacher forcing integration
        if Z1_start_tokens is not None and teacher_forcing_ratio > 0:
            Z1_processed = self.positional_encoding(self.preprocess_latent(Z1_start_tokens))

            # Create mixing mask
            mask = torch.rand(B, 1, 1, device=Z.device) < teacher_forcing_ratio
            processed_start = torch.where(mask, Z1_processed, processed_start)

        # Decoder processing with residual
        decoder_input = self.positional_encoding(processed_start)
        outputs = self.decoder(decoder_input, Z)
        outputs = self.output_layer(outputs + decoder_input)

        return self.postprocess_latent(outputs)

class DeepfakeToSourceTransformer(nn.Module):
    def __init__(self, d_model=256, encoder_nhead=8, decoder_nhead=8, num_encoder_layers=6,
                 num_decoder_layers=12, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.encoder = TransformerEncoder(
            d_model=d_model,
            nhead=encoder_nhead,
            num_layers=num_encoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.decoder = TransformerDecoder(
            d_model=d_model,
            nhead=decoder_nhead,
            num_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )

    def forward(self, Z, Z1_start_tokens=None, teacher_forcing_ratio=0.5):
        memory = self.encoder(Z)
        Z1 = self.decoder(memory, Z1_start_tokens, teacher_forcing_ratio=teacher_forcing_ratio)
        return Z1
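An illustrative shape check (not part of the commit): the model maps a VQGAN quantized latent of shape (B, 256, 32, 32) to a latent of the same shape, which is what app.py decodes back into an image.

import torch
from modules.modelz import DeepfakeToSourceTransformer

model = DeepfakeToSourceTransformer()
z_df = torch.randn(1, 256, 32, 32)   # stand-in for a quantized deepfake latent
with torch.no_grad():
    z_src = model(z_df)              # no start tokens, so the learned start tokens are used
print(z_src.shape)                   # torch.Size([1, 256, 32, 32])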
modules/segmentface.py
ADDED
@@ -0,0 +1,75 @@
import io
import cv2
import mediapipe as mp
import numpy as np
from rembg import remove
from PIL import Image

class FaceSegmenter:
    def __init__(self, threshold=0.5):
        self.threshold = threshold
        # Initialize face detection
        self.face_detection = mp.solutions.face_detection.FaceDetection(
            model_selection=1,  # 1 for general use, 0 for close-up faces
            min_detection_confidence=0.5
        )
        # Initialize selfie segmentation (for background removal)
        self.selfie_segmentation = mp.solutions.selfie_segmentation.SelfieSegmentation(
            model_selection=1  # 1 for general use, 0 for close-up faces
        )

    def segment_face(self, image_path):
        # Load the image
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError("Image not found or unable to load.")

        # Convert to RGB (MediaPipe requires RGB input)
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Step 1: Detect the face
        face_results = self.face_detection.process(rgb_image)
        if not face_results.detections:
            # No face found: use rembg to remove the background instead
            with open(image_path, "rb") as input_file:
                input_image = input_file.read()
            output_image = remove(input_image)
            # Convert the output image to a numpy array
            output_image = np.array(Image.open(io.BytesIO(output_image)))
            # Convert RGBA to RGB (remove alpha channel)
            if output_image.shape[2] == 4:
                output_image = cv2.cvtColor(output_image, cv2.COLOR_RGBA2RGB)
            return output_image  # note: this fallback returns an RGB array; the path below returns BGR

        # Get the bounding box of the first detected face
        detection = face_results.detections[0]
        bboxC = detection.location_data.relative_bounding_box
        h, w, _ = image.shape
        x, y, width, height = int(bboxC.xmin * w), int(bboxC.ymin * h), \
                              int(bboxC.width * w), int(bboxC.height * h)

        # Step 2: Segment the foreground (selfie segmentation)
        segmentation_results = self.selfie_segmentation.process(rgb_image)
        if segmentation_results.segmentation_mask is None:
            raise ValueError("Segmentation failed.")

        # Create a binary mask
        mask = (segmentation_results.segmentation_mask > self.threshold).astype(np.uint8)

        # Step 3: Crop the face using the bounding box
        face_mask = np.zeros_like(mask)
        face_mask[y:y+height, x:x+width] = mask[y:y+height, x:x+width]

        # Apply the mask to the original image
        segmented_face = cv2.bitwise_and(image, image, mask=face_mask)

        return segmented_face  # BGR array (OpenCV convention)

    def save_segmented_face(self, image_path, output_path):
        segmented_face = self.segment_face(image_path)
        cv2.imwrite(output_path, segmented_face)

    def show_segmented_face(self, image_path):
        segmented_face = self.segment_face(image_path)
        cv2.imshow("Segmented Face", segmented_face)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
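A short, illustrative usage sketch (not part of the commit; the example path is one of the images added above, and the output filename is hypothetical):

from modules.segmentface import FaceSegmenter

segmenter = FaceSegmenter(threshold=0.5)
seg = segmenter.segment_face("./images/df1.jpg")  # ndarray with the background zeroed out
print(seg.shape)                                  # (H, W, 3)
segmenter.save_segmented_face("./images/df1.jpg", "df1_segmented.jpg")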