Spaces: Running on Zero
Commit · e8227e4
Parent(s): e404aa3
update readme

Files changed:
- app.py (+13 -12)
- demo_gradio.py (+23 -19)
- demo_viser.py (+15 -101)
- gradio_util.py → visual_util.py (+7 -4)
app.py
CHANGED
@@ -15,25 +15,27 @@ from datetime import datetime
 import glob
 import gc
 import time
-import spaces
+# import spaces


 sys.path.append("vggt/")

-from gradio_util import predictions_to_glb
+from visual_util import predictions_to_glb
 from vggt.models.vggt import VGGT
 from vggt.utils.load_fn import load_and_preprocess_images
 from vggt.utils.pose_enc import pose_encoding_to_extri_intri
 from vggt.utils.geometry import unproject_depth_map_to_point_map

+# device = "cuda" if torch.cuda.is_available() else "cpu"

 print("Initializing and loading VGGT model...")
 # model = VGGT.from_pretrained("facebook/VGGT-1B") # another way to load the model

-# device = "cuda" if torch.cuda.is_available() else "cpu"
 model = VGGT()
 _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
 model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
+
+
 model.eval()
 # model = model.to(device)

@@ -41,7 +43,7 @@ model.eval()
 # -------------------------------------------------------------------------
 # 1) Core model inference
 # -------------------------------------------------------------------------
-@spaces.GPU(duration=120)
+# @spaces.GPU(duration=120)
 def run_model(target_dir, model) -> dict:
     """
     Run the VGGT model on images in the 'target_dir/images' folder and return predictions.
@@ -181,7 +183,7 @@ def update_gallery_on_upload(input_video, input_images):
 # -------------------------------------------------------------------------
 # 4) Reconstruction: uses the target_dir plus any viz parameters
 # -------------------------------------------------------------------------
-@spaces.GPU(duration=120)
+# @spaces.GPU(duration=120)
 def gradio_demo(
     target_dir,
     conf_thres=3.0,
@@ -313,7 +315,7 @@ def update_visualization(
 # Example images
 # -------------------------------------------------------------------------

-canyon_video = "examples/videos/Studlagil_Canyon_East_Iceland.mp4"
+# canyon_video = "examples/videos/Studlagil_Canyon_East_Iceland.mp4"
 great_wall_video = "examples/videos/great_wall.mp4"
 colosseum_video = "examples/videos/Colosseum.mp4"
 room_video = "examples/videos/room.mp4"
@@ -392,9 +394,9 @@ with gr.Blocks(

 <h3>Getting Started:</h3>
 <ol>
-<li><strong>Upload Your Data:</strong> Use the
+<li><strong>Upload Your Data:</strong> Use the "Upload Video" or "Upload Images" buttons on the left to provide your input. Videos will be automatically split into individual frames (one frame per second).</li>
 <li><strong>Preview:</strong> Your uploaded images will appear in the gallery on the left.</li>
-<li><strong>Reconstruct:</strong> Click the
+<li><strong>Reconstruct:</strong> Click the "Reconstruct" button to start the 3D reconstruction process.</li>
 <li><strong>Visualize:</strong> The 3D reconstruction will appear in the viewer on the right. You can rotate, pan, and zoom to explore the model, and download the GLB file. Note the visualization of 3D points may be slow for a large number of input images.</li>
 <li>
 <strong>Adjust Visualization (Optional):</strong>
@@ -406,17 +408,16 @@ with gr.Blocks(
 <li><em>Show Points from Frame:</em> Select specific frames to display in the point cloud.</li>
 <li><em>Show Camera:</em> Toggle the display of estimated camera positions.</li>
 <li><em>Filter Sky / Filter Black Background:</em> Remove sky or black-background points.</li>
-<li><em>Select a Prediction Mode:</em> Choose between
+<li><em>Select a Prediction Mode:</em> Choose between "Depthmap and Camera Branch" or "Pointmap Branch."</li>
 </ul>
 </details>
 </li>
 </ol>
-<p><strong>Please note:</strong> Our method usually only needs less than 1 second to reconstruct a scene, but the visualization of 3D points may take tens of seconds
+<p><strong style="color: #0ea5e9;">Please note:</strong> <span style="color: #0ea5e9; font-weight: bold;">Our method usually only needs less than 1 second to reconstruct a scene, but the visualization of 3D points may take tens of seconds</span>, especially when the number of images is large. Please be patient or, for faster visualization, use a local machine to run our demo from our <a href="https://github.com/facebookresearch/vggt">GitHub repository</a>.</p>
 </div>
 """
 )

-
 target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")

 with gr.Row():
@@ -472,7 +473,7 @@
 [pyramid_video, "30", None, 35.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
 [single_cartoon_video, "1", None, 15.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
 [single_oil_painting_video, "1", None, 20.0, False, True, True, True, "Depthmap and Camera Branch", "True"],
-[canyon_video, "14", None, 40.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
+# [canyon_video, "14", None, 40.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
 [room_video, "8", None, 5.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
 [kitchen_video, "25", None, 50.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
 [fern_video, "20", None, 45.0, False, False, True, False, "Depthmap and Camera Branch", "True"],
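The main change in app.py is that the ZeroGPU hooks (the spaces import and the @spaces.GPU(duration=120) decorators) are commented out and the model is left on CPU (# model = model.to(device) stays disabled). As a rough illustration of the two startup styles this toggles between, here is a minimal sketch; it is not code from the repository, run_inference is a hypothetical placeholder, and the guarded import assumes the spaces package available on Hugging Face ZeroGPU Spaces.

import torch

# Plain-CUDA startup, as kept in demo_gradio.py: pick a device once and move the model there.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ZeroGPU startup, the path app.py disables in this commit: the GPU is attached only
# while the decorated function runs, so nothing is moved to CUDA at import time.
try:
    import spaces  # only available on Hugging Face ZeroGPU Spaces

    @spaces.GPU(duration=120)  # request a GPU for up to 120 seconds per call
    def run_inference(model, images):  # hypothetical stand-in for run_model()/gradio_demo()
        return model(images)
except ImportError:
    pass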
demo_gradio.py
CHANGED
@@ -18,20 +18,22 @@ import time

 sys.path.append("vggt/")

-from gradio_util import predictions_to_glb
+from visual_util import predictions_to_glb
 from vggt.models.vggt import VGGT
 from vggt.utils.load_fn import load_and_preprocess_images
 from vggt.utils.pose_enc import pose_encoding_to_extri_intri
 from vggt.utils.geometry import unproject_depth_map_to_point_map

+device = "cuda" if torch.cuda.is_available() else "cpu"

 print("Initializing and loading VGGT model...")
 # model = VGGT.from_pretrained("facebook/VGGT-1B") # another way to load the model

-device = "cuda" if torch.cuda.is_available() else "cpu"
 model = VGGT()
 _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
 model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
+
+
 model.eval()
 model = model.to(device)

@@ -375,35 +377,37 @@ with gr.Blocks(
 is_example = gr.Textbox(label="is_example", visible=False, value="None")
 num_images = gr.Textbox(label="num_images", visible=False, value="None")

-gr.
-
-
-
-
+gr.HTML(
+"""
+<h1>VGGT: Visual Geometry Grounded Transformer</h1>
+<p>
+<a href="https://github.com/facebookresearch/vggt">GitHub Repository</a> |
+<a href="#">Project Page</a>
+</p>

 <div style="font-size: 16px; line-height: 1.5;">
-<p>Upload a video or a set of images to create a 3D reconstruction of a scene or object.
+<p>Upload a video or a set of images to create a 3D reconstruction of a scene or object. VGGT takes these images and generates a 3D point cloud, along with estimated camera poses.</p>

 <h3>Getting Started:</h3>
 <ol>
-<li><strong>Upload Your Data:</strong> Use the
-<li><strong>Preview:</strong>
-<li><strong>Reconstruct:</strong> Click the
-<li><strong>Visualize:</strong>
-
+<li><strong>Upload Your Data:</strong> Use the “Upload Video” or “Upload Images” buttons on the left to provide your input. Videos will be automatically split into individual frames (one frame per second).</li>
+<li><strong>Preview:</strong> Your uploaded images will appear in the gallery on the left.</li>
+<li><strong>Reconstruct:</strong> Click the “Reconstruct” button to start the 3D reconstruction process.</li>
+<li><strong>Visualize:</strong> The 3D reconstruction will appear in the viewer on the right. You can rotate, pan, and zoom to explore the model, and download the GLB file. Note the visualization of 3D points may be slow for a large number of input images.</li>
+<li>
 <strong>Adjust Visualization (Optional):</strong>
-After reconstruction, you can fine-tune the visualization using the options below
+After reconstruction, you can fine-tune the visualization using the options below
 <details style="display:inline;">
-
-
+<summary style="display:inline;">(<strong>click to expand</strong>):</summary>
+<ul>
 <li><em>Confidence Threshold:</em> Adjust the filtering of points based on confidence.</li>
 <li><em>Show Points from Frame:</em> Select specific frames to display in the point cloud.</li>
 <li><em>Show Camera:</em> Toggle the display of estimated camera positions.</li>
 <li><em>Filter Sky / Filter Black Background:</em> Remove sky or black-background points.</li>
-<li><em>Select a Prediction Mode:</em> Choose between
-
+<li><em>Select a Prediction Mode:</em> Choose between “Depthmap and Camera Branch” or “Pointmap Branch.”</li>
+</ul>
 </details>
-
+</li>
 </ol>
 <p><strong>Please note:</strong> Our method usually only needs less than 1 second to reconstruct a scene, but the visualization of 3D points may take tens of seconds, especially when the number of images is large. Please be patient or, for faster visualization, use a local machine to run our demo from our <a href="https://github.com/facebookresearch/vggt">GitHub repository</a>.</p>
 </div>
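The demos keep the same checkpoint-loading pattern: construct VGGT() and pull the weights through torch.hub.load_state_dict_from_url, with VGGT.from_pretrained("facebook/VGGT-1B") left as a commented-out alternative. A minimal sketch of that pattern follows; the load_vggt wrapper and the explicit map_location="cpu" are additions for illustration, not part of the diff.

import torch
from vggt.models.vggt import VGGT

_URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"

def load_vggt(device: str = "cpu") -> VGGT:
    # Build the module, then load the checkpoint through torch.hub's cached downloader.
    model = VGGT()
    state_dict = torch.hub.load_state_dict_from_url(_URL, map_location="cpu")
    model.load_state_dict(state_dict)
    # model = VGGT.from_pretrained("facebook/VGGT-1B")  # alternative kept commented out in the demos
    return model.eval().to(device)

model = load_vggt("cuda" if torch.cuda.is_available() else "cpu")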
demo_viser.py
CHANGED
@@ -10,7 +10,6 @@ import time
 import threading
 import argparse
 from typing import List, Optional
-import copy

 import numpy as np
 import torch
@@ -18,12 +17,14 @@ from tqdm.auto import tqdm
 import viser
 import viser.transforms as viser_tf
 import cv2
-
+
+
 try:
     import onnxruntime
 except ImportError:
     print("onnxruntime not found. Sky segmentation may not work.")

+from visual_util import segment_sky, download_file_from_url
 from vggt.models.vggt import VGGT
 from vggt.utils.load_fn import load_and_preprocess_images
 from vggt.utils.geometry import closed_form_inverse_se3, unproject_depth_map_to_point_map
@@ -95,7 +96,7 @@ def viser_wrapper(
     # Flatten
     points = world_points.reshape(-1, 3)
     colors_flat = (colors.reshape(-1, 3) * 255).astype(np.uint8)
-
+    conf_flat = conf.reshape(-1)

     cam_to_world_mat = closed_form_inverse_se3(extrinsics_cam)  # shape (S, 4, 4) typically
     # For convenience, we store only (3,4) portion
@@ -132,13 +133,12 @@ def viser_wrapper(

     # Create the main point cloud handle
     # Compute the threshold value as the given percentile
-    init_threshold_val = np.percentile(
-    init_conf_mask =
+    init_threshold_val = np.percentile(conf_flat, init_conf_threshold)
+    init_conf_mask = (conf_flat >= init_threshold_val) & (conf_flat > 0.1)
     point_cloud = server.scene.add_point_cloud(
         name="viser_pcd",
         points=points_centered[init_conf_mask],
         colors=colors_flat[init_conf_mask],
-        # point_size=0.0001,
         point_size=0.001,
         point_shape="circle",
     )
@@ -213,8 +213,11 @@ def viser_wrapper(
         """Update the point cloud based on current GUI selections."""
         # Here we compute the threshold value based on the current percentage
         current_percentage = gui_points_conf.value
-        threshold_val = np.percentile(
-
+        threshold_val = np.percentile(conf_flat, current_percentage)
+
+        print(f"Threshold absolute value: {threshold_val}, percentage: {current_percentage}%")
+
+        conf_mask = (conf_flat >= threshold_val) & (conf_flat > 1e-5)

         if gui_frame_selector.value == "All":
             frame_mask = np.ones_like(conf_mask, dtype=bool)
@@ -264,30 +267,6 @@ def viser_wrapper(

 # Helper functions for sky segmentation

-def download_file_from_url(url, filename):
-    """Downloads a file from a Hugging Face model repo, handling redirects."""
-    try:
-        # Get the redirect URL
-        response = requests.get(url, allow_redirects=False)
-        response.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
-
-        if response.status_code == 302:  # Expecting a redirect
-            redirect_url = response.headers["Location"]
-            response = requests.get(redirect_url, stream=True)
-            response.raise_for_status()
-        else:
-            print(f"Unexpected status code: {response.status_code}")
-            return
-
-        with open(filename, "wb") as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
-        print(f"Downloaded {filename} successfully.")
-
-    except requests.exceptions.RequestException as e:
-        print(f"Error downloading file: {e}")
-
-

 def apply_sky_segmentation(conf: np.ndarray, image_folder: str) -> np.ndarray:
     """
@@ -335,7 +314,7 @@ def apply_sky_segmentation(conf: np.ndarray, image_folder: str) -> np.ndarray:
     # Convert list to numpy array with shape S×H×W
     sky_mask_array = np.array(sky_mask_list)
     # Apply sky mask to confidence scores
-    sky_mask_binary = (sky_mask_array > 0.
+    sky_mask_binary = (sky_mask_array > 0.1).astype(np.float32)
     conf = conf * sky_mask_binary

     print("Sky segmentation applied successfully")
@@ -343,73 +322,6 @@ def apply_sky_segmentation(conf: np.ndarray, image_folder: str) -> np.ndarray:



-def segment_sky(image_path, onnx_session, mask_filename=None):
-    """
-    Segments sky from an image using an ONNX model.
-
-    Args:
-        image_path: Path to input image
-        onnx_session: ONNX runtime session with loaded model
-        mask_filename: Path to save the output mask
-
-    Returns:
-        np.ndarray: Binary mask where 255 indicates non-sky regions
-    """
-    assert mask_filename is not None
-    image = cv2.imread(image_path)
-
-    result_map = run_skyseg(onnx_session, [320, 320], image)
-    # resize the result_map to the original image size
-    result_map_original = cv2.resize(result_map, (image.shape[1], image.shape[0]))
-
-    output_mask = np.zeros_like(result_map_original)
-    output_mask[result_map_original < 1] = 1
-    output_mask = output_mask.astype(np.uint8) * 255
-    os.makedirs(os.path.dirname(mask_filename), exist_ok=True)
-    cv2.imwrite(mask_filename, output_mask)
-    return output_mask
-
-
-def run_skyseg(onnx_session, input_size, image):
-    """
-    Runs sky segmentation inference using ONNX model.
-
-    Args:
-        onnx_session: ONNX runtime session
-        input_size: Target size for model input (width, height)
-        image: Input image in BGR format
-
-    Returns:
-        np.ndarray: Segmentation mask
-    """
-    # Pre process:Resize, BGR->RGB, Transpose, PyTorch standardization, float32 cast
-    temp_image = copy.deepcopy(image)
-    resize_image = cv2.resize(temp_image, dsize=(input_size[0], input_size[1]))
-    x = cv2.cvtColor(resize_image, cv2.COLOR_BGR2RGB)
-    x = np.array(x, dtype=np.float32)
-    mean = [0.485, 0.456, 0.406]
-    std = [0.229, 0.224, 0.225]
-    x = (x / 255 - mean) / std
-    x = x.transpose(2, 0, 1)
-    x = x.reshape(-1, 3, input_size[0], input_size[1]).astype("float32")
-
-    # Inference
-    input_name = onnx_session.get_inputs()[0].name
-    output_name = onnx_session.get_outputs()[0].name
-    onnx_result = onnx_session.run([output_name], {input_name: x})
-
-    # Post process
-    onnx_result = np.array(onnx_result).squeeze()
-    min_value = np.min(onnx_result)
-    max_value = np.max(onnx_result)
-    onnx_result = (onnx_result - min_value) / (max_value - min_value)
-    onnx_result *= 255
-    onnx_result = onnx_result.astype("uint8")
-
-    return onnx_result
-
-
-



@@ -450,6 +362,8 @@ def main():
     print(f"Using device: {device}")

     print("Initializing and loading VGGT model...")
+    # model = VGGT.from_pretrained("facebook/VGGT-1B")
+
     model = VGGT()
     _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
     model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
@@ -503,4 +417,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
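The viser demo now flattens the confidence map once (conf_flat = conf.reshape(-1)) and derives its point mask from a percentile of that array plus a small absolute floor, so points zeroed out by the sky mask cannot reappear at low slider values. A self-contained sketch of that masking logic follows; the confidence_mask helper and the random test array are illustrative only.

import numpy as np

def confidence_mask(conf_flat: np.ndarray, percentage: float, floor: float = 1e-5) -> np.ndarray:
    # Interpret the slider value as a percentile of the per-point confidences,
    # then keep points at or above that percentile and strictly above the floor.
    threshold_val = np.percentile(conf_flat, percentage)
    return (conf_flat >= threshold_val) & (conf_flat > floor)

# Example: keep roughly the top half of a synthetic confidence array.
conf_flat = np.random.rand(10_000).astype(np.float32)
mask = confidence_mask(conf_flat, 50.0)
print(f"kept {mask.sum()} of {conf_flat.size} points")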
gradio_util.py → visual_util.py
RENAMED
@@ -131,7 +131,7 @@ def predictions_to_glb(
     sky_mask_array = np.array(sky_mask_list)

     # Apply sky mask to confidence scores
-    sky_mask_binary = (sky_mask_array > 0.
+    sky_mask_binary = (sky_mask_array > 0.1).astype(np.float32)
     pred_world_points_conf = pred_world_points_conf * sky_mask_binary

     if selected_frame_idx is not None:
@@ -155,7 +155,7 @@ def predictions_to_glb(
     else:
         conf_threshold = np.percentile(conf, conf_thres)

-    conf_mask = conf >= conf_threshold
+    conf_mask = (conf >= conf_threshold) & (conf > 1e-5)

     if mask_black_bg:
         black_bg_mask = colors_rgb.sum(axis=1) >= 16
@@ -370,6 +370,7 @@ def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
 def segment_sky(image_path, onnx_session, mask_filename=None):
     """
     Segments sky from an image using an ONNX model.
+    Thanks for the great model provided by https://github.com/xiongzhu666/Sky-Segmentation-and-Post-processing

     Args:
         image_path: Path to input image
@@ -387,9 +388,11 @@ def segment_sky(image_path, onnx_session, mask_filename=None):
     # resize the result_map to the original image size
     result_map_original = cv2.resize(result_map, (image.shape[1], image.shape[0]))

+    # Fix: Invert the mask so that 255 = non-sky, 0 = sky
+    # The model outputs low values for sky, high values for non-sky
     output_mask = np.zeros_like(result_map_original)
-    output_mask[result_map_original < 1] = 1
-    output_mask = output_mask.astype(np.uint8) * 255
+    output_mask[result_map_original < 32] = 255  # Use threshold of 32
+
     os.makedirs(os.path.dirname(mask_filename), exist_ok=True)
     cv2.imwrite(mask_filename, output_mask)
     return output_mask
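The segment_sky change replaces the old two-step mask construction with a single thresholding step: pixels whose raw ONNX score is below 32 are written as 255 in the mask, and predictions_to_glb / apply_sky_segmentation then binarise that mask (> 0.1) and multiply it into the confidence map, so masked-out pixels get confidence 0 and are later dropped by the (conf > 1e-5) check. A toy end-to-end sketch of that interaction follows; the helper names and the 2×2 score map are made up for illustration.

import numpy as np

def sky_scores_to_mask(result_map: np.ndarray) -> np.ndarray:
    # Mirror the new segment_sky thresholding: scores below 32 become 255 (kept),
    # everything else stays 0 (treated as sky and filtered out downstream).
    output_mask = np.zeros_like(result_map, dtype=np.uint8)
    output_mask[result_map < 32] = 255
    return output_mask

def weight_confidence_by_sky(conf: np.ndarray, sky_mask: np.ndarray) -> np.ndarray:
    # Binarise the mask and zero out the confidence of masked pixels.
    sky_mask_binary = (sky_mask > 0.1).astype(np.float32)
    return conf * sky_mask_binary

scores = np.array([[5.0, 200.0], [10.0, 250.0]], dtype=np.float32)  # fake sky scores
conf = np.ones_like(scores)
weighted = weight_confidence_by_sky(conf, sky_scores_to_mask(scores))
# weighted == [[1, 0], [1, 0]]: pixels with scores >= 32 now fail the conf > 1e-5 filter.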