Spaces:

sayedM
/

DINOv3-PCA-visualization

Running

App Files Files Community

sayedM committed 21 days ago

Commit

471a3ca

verified ·

1 Parent(s): 53fb72b

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -12

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import torch
 import gradio as gr
 import numpy as np
@@ -5,16 +6,18 @@ from PIL import Image
 import torchvision.transforms.functional as TF
 from matplotlib import colormaps
 from transformers import AutoModel
 # ----------------------------
 # Configuration
 # ----------------------------
 # The model will be downloaded from the Hugging Face Hub
-MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
 PATCH_SIZE = 16
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Normalization constants
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -25,14 +28,17 @@ def load_model_from_hub():
     """Loads the DINOv3 model from the Hugging Face Hub."""
     print(f"Loading model '{MODEL_ID}' from Hugging Face Hub...")
     try:
-        model = AutoModel.from_pretrained(MODEL_ID)
         model.to(DEVICE).eval()
         print(f"✅ Model loaded successfully on device: {DEVICE}")
         return model
     except Exception as e:
         print(f"❌ Failed to load model: {e}")
-        gr.Error(f"Could not load model from Hub: {e}")
-        return None
 # Load the model globally when the app starts
 model = load_model_from_hub()
@@ -79,7 +85,7 @@ def generate_pca_visuals(
 ):
     """Main function to generate PCA visuals."""
     if model is None:
-        raise gr.Error("DINOv3 model could not be loaded. Check the logs.")
     if image_pil is None:
         return None, None, "Please upload an image and click Generate.", None, None
@@ -94,20 +100,24 @@ def generate_pca_visuals(
     # 2. Feature Extraction
     progress(0.5, desc="🦖 Extracting features with DINOv3...")
     outputs = model(t_norm)
-    # The patch embeddings are in last_hidden_state, we skip the first token (CLS)
-    patch_embeddings = outputs.last_hidden_state.squeeze(0)[1:, :]
     # 3. PCA Calculation
     progress(0.8, desc="🔬 Performing PCA...")
     X_centered = patch_embeddings.float() - patch_embeddings.float().mean(0, keepdim=True)
     U, S, V = torch.pca_lowrank(X_centered, q=3, center=False)
-    # Stabilize the signs of the eigenvectors for deterministic output
     for i in range(V.shape[1]):
         max_abs_idx = torch.argmax(torch.abs(V[:, i]))
         if V[max_abs_idx, i] < 0:
             V[:, i] *= -1
     scores = X_centered @ V[:, :3]
     # 4. Explained Variance
@@ -121,8 +131,10 @@ def generate_pca_visuals(
     )
     # 5. Create Visualizations
     pc1_map = scores[:, 0].reshape(Hp, Wp).cpu().numpy()
     pc1_image_raw = colorize(pc1_map, cmap_name)
     pc_rgb_map = scores.reshape(Hp, Wp, 3).cpu().numpy()
     min_vals = pc_rgb_map.reshape(-1, 3).min(axis=0)
     max_vals = pc_rgb_map.reshape(-1, 3).max(axis=0)
@@ -137,7 +149,6 @@ def generate_pca_visuals(
     progress(1.0, desc="✅ Done!")
     return pc1_image_smooth, pc_rgb_image_smooth, variance_text, blended_image, original_processed_image
 # ----------------------------
 # Gradio Interface
 # ----------------------------
@@ -152,7 +163,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 PCA Explorer") as demo:
     with gr.Row():
         with gr.Column(scale=2):
-            input_image = gr.Image(type="pil", label="Upload Image", value="https://picsum.photos/id/1011/800/600")
             with gr.Accordion("⚙️ Visualization Controls", open=True):
                 resolution_slider = gr.Slider(

+# app.py
 import torch
 import gradio as gr
 import numpy as np
 import torchvision.transforms.functional as TF
 from matplotlib import colormaps
 from transformers import AutoModel
+import os
 # ----------------------------
 # Configuration
 # ----------------------------
 # The model will be downloaded from the Hugging Face Hub
+# Hub repository ID of the checkpoint loaded via transformers AutoModel (no revision is pinned)
+MODEL_ID = "facebook/dinov3-vith16plus"
 PATCH_SIZE = 16
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Normalization constants (standard for ImageNet)
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
     """Loads the DINOv3 model from the Hugging Face Hub."""
     print(f"Loading model '{MODEL_ID}' from Hugging Face Hub...")
     try:
+        # Use your HF token if the model is gated
+        # You can set this as a secret in your Hugging Face Space settings
+        token = os.environ.get("HF_TOKEN")
+        model = AutoModel.from_pretrained(MODEL_ID, token=token, trust_remote_code=True)
         model.to(DEVICE).eval()
         print(f"✅ Model loaded successfully on device: {DEVICE}")
         return model
     except Exception as e:
         print(f"❌ Failed to load model: {e}")
+        # This will display an error message in the Gradio interface
+        raise gr.Error(f"Could not load model from Hub. If it's a gated model, ensure you have access and have set your HF_TOKEN secret in the Space settings. Error: {e}")
 # Load the model globally when the app starts
 model = load_model_from_hub()
 ):
     """Main function to generate PCA visuals."""
     if model is None:
+        raise gr.Error("DINOv3 model is not available. Check the startup logs.")
     if image_pil is None:
         return None, None, "Please upload an image and click Generate.", None, None
     # 2. Feature Extraction
     progress(0.5, desc="🦖 Extracting features with DINOv3...")
     outputs = model(t_norm)
+    # 💡 FIX: The model output includes a [CLS] token AND 4 register tokens.
+    # We must skip all of them (total 5) to get only the patch embeddings.
+    # The original code only skipped 1, causing the size mismatch.
+    n_special_tokens = 5 # 1 [CLS] token + 4 register tokens
+    patch_embeddings = outputs.last_hidden_state.squeeze(0)[n_special_tokens:, :]
     # 3. PCA Calculation
     progress(0.8, desc="🔬 Performing PCA...")
     X_centered = patch_embeddings.float() - patch_embeddings.float().mean(0, keepdim=True)
     U, S, V = torch.pca_lowrank(X_centered, q=3, center=False)
+    # 💡 IMPROVEMENT: Stabilize the signs of the eigenvectors for deterministic output.
+    # This prevents the colors from randomly inverting on different runs.
     for i in range(V.shape[1]):
         max_abs_idx = torch.argmax(torch.abs(V[:, i]))
         if V[max_abs_idx, i] < 0:
             V[:, i] *= -1
     scores = X_centered @ V[:, :3]
     # 4. Explained Variance
     )
     # 5. Create Visualizations
+    # This part should now work correctly as `scores` has the right shape (Hp*Wp, 3)
     pc1_map = scores[:, 0].reshape(Hp, Wp).cpu().numpy()
     pc1_image_raw = colorize(pc1_map, cmap_name)
     pc_rgb_map = scores.reshape(Hp, Wp, 3).cpu().numpy()
     min_vals = pc_rgb_map.reshape(-1, 3).min(axis=0)
     max_vals = pc_rgb_map.reshape(-1, 3).max(axis=0)
     progress(1.0, desc="✅ Done!")
     return pc1_image_smooth, pc_rgb_image_smooth, variance_text, blended_image, original_processed_image
 # ----------------------------
 # Gradio Interface
 # ----------------------------
     with gr.Row():
         with gr.Column(scale=2):
+            # Added a default image URL for convenience
+            input_image = gr.Image(type="pil", label="Upload Image", value="https://images.squarespace-cdn.com/content/v1/607f89e638219e13eee71b1e/1684821560422-SD5V37BAG28BURTLIXUQ/michael-sum-LEpfefQf4rU-unsplash.jpg")
             with gr.Accordion("⚙️ Visualization Controls", open=True):
                 resolution_slider = gr.Slider(