import gradio as gr
from PIL import Image, ImageDraw
import scipy.io.wavfile as wavfile

# Use a pipeline as a high-level helper
from transformers import pipeline

model_path = ("../Model/models--facebook--detr-resnet-50/snapshots"
              "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
tts_model_path = ("../Model/models--kakao-enterprise--vits-ljs/snapshots"
                  "/3bcb8321394f671bd948ebf0d086d694dda95464")

narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# To load from the local snapshots defined above instead of the Hugging Face Hub:
# object_detector = pipeline("object-detection", model=model_path)
# narrator = pipeline("text-to-speech", model=tts_model_path)


def generate_audio(text):
    """Narrate the given text and write the result to a WAV file."""
    narrated_text = narrator(text)
    wavfile.write("finetuned_output.wav",
                  rate=narrated_text["sampling_rate"],
                  data=narrated_text["audio"][0])
    return "finetuned_output.wav"


def read_objects(detection_objects):
    """Build a natural-language summary of the detected objects."""
    # Count the occurrences of each label
    object_counts = {}
    for detection in detection_objects:
        label = detection["label"]
        object_counts[label] = object_counts.get(label, 0) + 1

    # Handle the case where nothing was detected
    if not object_counts:
        return "This picture contains no recognizable objects."

    # Generate the response string, joining items with commas and "and"
    response = "This picture contains"
    labels = list(object_counts.keys())
    for i, label in enumerate(labels):
        response += f" {object_counts[label]} {label}"
        if object_counts[label] > 1:
            response += "s"
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"
    response += "."

    return response


def draw_bounding_boxes(image, detection_results):
    """
    Draws bounding boxes on the provided image based on the detection results.

    Parameters:
        image (PIL.Image): The input image to be annotated.
        detection_results (list): A list of dictionaries, each containing a
            detected object's label, score, and bounding box.

    Returns:
        PIL.Image: The image with bounding boxes drawn around the detected
        objects. Note that the input image is modified in place.
    """
    # Create an ImageDraw object to draw on the image
    draw = ImageDraw.Draw(image)

    # Iterate through each detection result
    for result in detection_results:
        # Extract the bounding box coordinates, label, and confidence score
        box = result["box"]
        label = result["label"]
        score = result["score"]

        # Define coordinates for the bounding box
        xmin, ymin, xmax, ymax = box["xmin"], box["ymin"], box["xmax"], box["ymax"]

        # Draw the bounding box with a red outline
        draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=3)

        # Add the label with its confidence score above the bounding box
        text = f"{label} ({score * 100:.1f}%)"
        draw.text((xmin, ymin - 10), text, fill="red")

    return image


def detect_objects(image):
    """Run detection, annotate the image, and narrate a spoken summary."""
    raw_image = image
    output = object_detector(raw_image)
    processed_image = draw_bounding_boxes(raw_image, output)
    naturalized_text = read_objects(output)
    processed_audio = generate_audio(naturalized_text)
    return processed_image, processed_audio


demo = gr.Interface(fn=detect_objects,
                    inputs=[gr.Image(label="Select Image", type="pil")],
                    outputs=[gr.Image(label="Processed Image", type="pil"),
                             gr.Audio(label="Generated Audio")],
                    title="@SherryAhuja Project: Object Detection with Audio",
                    description="This AI application detects objects in an image and generates an audio description.")
demo.launch()
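
# A quick way to exercise the pipeline functions without the web UI, assuming
# a sample image named "sample.jpg" (hypothetical filename) sits next to this
# script; demo.launch() blocks, so this is shown here as a commented sketch:
#
#   from PIL import Image
#   img = Image.open("sample.jpg")
#   annotated, audio_path = detect_objects(img)
#   annotated.save("annotated.jpg")  # narration is written to finetuned_output.wav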