from transformers import DPTImageProcessor, DPTForDepthEstimation
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
import gradio as gr
import supervision as sv
import torch
import numpy as np
from PIL import Image
import requests


class DepthPredictor:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The processor is a pure preprocessing object and has no .to();
        # only the model and the input tensors are moved to the device.
        self.processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
        self.model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(self.device)
        self.model.eval()

    def predict(self, image):
        # prepare image for the model
        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            predicted_depth = outputs.predicted_depth
        # interpolate to original size (PIL .size is (W, H), so reverse it)
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.size[::-1],
            mode="bicubic",
            align_corners=False,
        )
        # normalize to 0-255 and render as an 8-bit grayscale image
        output = prediction.squeeze().cpu().numpy()
        formatted = (output * 255 / np.max(output)).astype("uint8")
        depth = Image.fromarray(formatted)
        return depth


class sam_inference:
    def __init__(self):
        MODEL_TYPE = "vit_b"
        checkpoint = "sam_vit_b_01ec64.pth"
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # move SAM to the GPU when available; the mask generator runs on
        # whatever device the model lives on
        sam = sam_model_registry[MODEL_TYPE](checkpoint=checkpoint).to(device)
        self.mask_generator = SamAutomaticMaskGenerator(sam)

    def predict(self, image):
        # SamAutomaticMaskGenerator expects an HWC uint8 RGB numpy array
        sam_result = self.mask_generator.generate(image)
        mask_annotator = sv.MaskAnnotator()
        detections = sv.Detections.from_sam(sam_result=sam_result)
        annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
        return [annotated_image]
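

# Usage sketch: the gradio import above suggests these classes feed a small
# demo app. The wiring below (tab names, the `demo` variable, component types)
# is an illustrative assumption, not the original app code. gr.Image with
# type="pil" matches DepthPredictor.predict, and type="numpy" yields the HWC
# uint8 RGB array that SamAutomaticMaskGenerator.generate expects.
if __name__ == "__main__":
    depth_predictor = DepthPredictor()
    sam = sam_inference()

    def run_depth(image):
        # returns a PIL grayscale depth map
        return depth_predictor.predict(image)

    def run_sam(image):
        # predict() returns a one-element list; unwrap the annotated frame
        return sam.predict(image)[0]

    with gr.Blocks() as demo:
        with gr.Tab("Depth (DPT)"):
            depth_in = gr.Image(type="pil", label="Input")
            depth_out = gr.Image(type="pil", label="Depth map")
            gr.Button("Estimate depth").click(run_depth, inputs=depth_in, outputs=depth_out)
        with gr.Tab("Segmentation (SAM)"):
            sam_in = gr.Image(type="numpy", label="Input")
            sam_out = gr.Image(type="numpy", label="Annotated masks")
            gr.Button("Segment").click(run_sam, inputs=sam_in, outputs=sam_out)
    demo.launch()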