"""Gradio demo that draws DETR object detections onto every frame of a video."""

import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image

# Minimum class probability for a DETR query to be drawn as a detection.
CONFIDENCE_THRESHOLD = 0.7

# Load the pretrained DETR model. torchvision does not ship a DETR
# implementation, so the reference ResNet-50 checkpoint is fetched through
# torch.hub from the official repository instead.
# NOTE(review): the hub repository's own code may import extra packages
# (e.g. scipy) -- confirm they are installed in the deployment environment.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.hub.load(
    "facebookresearch/detr:main", "detr_resnet50", pretrained=True
)
model = model.to(device).eval()

# Define the transformation for the input image. The pretrained weights
# expect ImageNet-normalised RGB tensors.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((800, 800)),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])


def _to_pixel_corners(boxes, width, height):
    """Convert normalised (cx, cy, w, h) boxes to pixel (x0, y0, x1, y1) lists.

    Args:
        boxes: Float array of shape (N, 4) with values relative to the image
            size; N may be zero.
        width: Width in pixels of the image the boxes are drawn on.
        height: Height in pixels of the image the boxes are drawn on.

    Returns:
        A list of N ``[x0, y0, x1, y1]`` lists of plain Python ints.
    """
    cx, cy, w, h = boxes.T
    corners = np.stack(
        [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], axis=1
    )
    scale = np.array([width, height, width, height], dtype=corners.dtype)
    # .tolist() yields built-in ints, which every OpenCV build accepts as
    # point coordinates (NumPy scalar types are rejected by some versions).
    return (corners * scale).round().astype(int).tolist()


def detect_objects(frame):
    """Detect objects in one RGB frame and draw them onto it.

    Args:
        frame: Image as a NumPy array of shape (H, W, 3) or (H, W).

    Returns:
        The frame with a green rectangle and a ``Class: <id>`` caption for
        every detection whose confidence exceeds ``CONFIDENCE_THRESHOLD``.
        Drawing happens on the array that was passed in.
    """
    height, width = frame.shape[:2]

    # Convert the frame to a 3-channel PIL image and build a batch of one.
    image = Image.fromarray(frame).convert("RGB")
    batch = transform(image).unsqueeze(0).to(device)

    # Perform object detection
    with torch.no_grad():
        outputs = model(batch)

    # DETR emits one logit vector per query; the last entry is the
    # "no object" class and must not be picked as a label.
    probabilities = outputs["pred_logits"][0].softmax(-1)[:, :-1]
    scores, labels = probabilities.max(-1)
    keep = scores > CONFIDENCE_THRESHOLD

    # Boxes are relative to the network input, so scaling by the original
    # frame size undoes the 800x800 resize as well.
    boxes = _to_pixel_corners(
        outputs["pred_boxes"][0][keep].cpu().numpy(), width, height
    )
    kept_labels = labels[keep].cpu().tolist()

    # Draw bounding boxes on the frame
    for (x0, y0, x1, y1), label in zip(boxes, kept_labels):
        frame = cv2.rectangle(frame, (x0, y0), (x1, y1), (0, 255, 0), 2)
        frame = cv2.putText(
            frame,
            f'Class: {label}',
            (x0, y0 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            2,
            cv2.LINE_AA,
        )
    return frame


def annotate_video(video_path):
    """Run ``detect_objects`` on every frame of a video file.

    Gradio's video component hands the callback a file path and expects a
    file path back, so frames are decoded, annotated and re-encoded here.

    Args:
        video_path: Path of the input video, or ``None`` when the input
            has been cleared.

    Returns:
        Path of a newly written ``.mp4`` file with the detections drawn in,
        or ``None`` when no input video was given.

    Raises:
        ValueError: If the video cannot be opened.
    """
    if video_path is None:
        return None

    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        raise ValueError(f"Could not open video: {video_path}")

    # Some containers report 0 fps; fall back to a sane default.
    fps = capture.get(cv2.CAP_PROP_FPS) or 30.0
    size = (
        int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    )

    # Reserve a file name that outlives this function; Gradio reads it later.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        output_path = handle.name
    writer = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, size
    )

    try:
        while True:
            ok, bgr_frame = capture.read()
            if not ok:
                break
            # OpenCV decodes to BGR, while the model expects RGB.
            rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
            annotated = detect_objects(rgb_frame)
            writer.write(cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))
    finally:
        capture.release()
        writer.release()
    return output_path


# Define the Gradio interface
iface = gr.Interface(
    fn=annotate_video,
    inputs=gr.Video(),
    outputs="video",
    live=True,
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()