"""Bottle-label inspection: YOLO detection/tracking + Gemini AI analysis, served via a Gradio UI."""
import cv2
import numpy as np
from ultralytics import YOLO
import cvzone
import base64
import os
import gradio as gr
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
# ✅ Set up Google API Key.
# SECURITY(review): the API key was previously hardcoded in this file — a
# committed secret. Read it from the environment instead; fail fast with a
# clear message when it is missing.
_google_api_key = os.getenv("GOOGLE_API_KEY")
if not _google_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running.")
os.environ["GOOGLE_API_KEY"] = _google_api_key

# ✅ Initialize the Gemini model used for label/damage analysis.
gemini_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# ✅ Load the YOLO detection/tracking weights and its class-id -> name mapping.
yolo_model = YOLO("best.pt")
names = yolo_model.names
def encode_image_to_base64(image):
    """Encode a BGR image (numpy array) as a base64 JPEG string.

    Args:
        image: image array as produced by OpenCV (H x W x C, uint8).

    Returns:
        The JPEG bytes of the image, base64-encoded as an ASCII str.

    Raises:
        ValueError: if OpenCV cannot JPEG-encode the image.
    """
    ok, img_buffer = cv2.imencode('.jpg', image)
    if not ok:
        # cv2.imencode reports failure via its boolean flag; the original code
        # discarded it and would have base64-encoded an invalid buffer.
        raise ValueError("Failed to encode image as JPEG.")
    return base64.b64encode(img_buffer).decode('utf-8')
def analyze_image_with_gemini(image):
    """Ask Gemini whether the bottle crop shows a label and any damage.

    Returns Gemini's markdown-table answer, or an error string when the crop
    is unusable or the API call fails.
    """
    # Reject empty or degenerate crops before spending an API call on them.
    if image is None or image.shape[0] == 0 or image.shape[1] == 0:
        return "Error: Invalid image."

    b64_image = encode_image_to_base64(image)
    prompt = """
Analyze this image and determine if the label is present on the bottle.
Return the result strictly in a structured table format:
| Label Present | Damage |
|--------------|--------|
| Yes/No | Yes/No |
"""
    content = [
        {"type": "text", "text": prompt},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"},
            "description": "Detected product",
        },
    ]
    try:
        reply = gemini_model.invoke([HumanMessage(content=content)])
    except Exception as e:
        return f"Error processing image: {e}"
    return reply.content
def process_video(video_path):
    """Process a video: YOLO-track bottles and analyze each new track with Gemini once.

    Each frame is annotated with the detection box, track id, class name, and
    the Gemini verdict for that object; the annotated video is written to
    "output.mp4" and one log line is kept per analyzed object.

    Args:
        video_path: path to the input video file.

    Returns:
        (output_video_path, log_text) on success, or
        (error_message, "") if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return "Error: Could not open video file.", ""

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Some containers report 0 fps, which would break VideoWriter — fall back.
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    output_video_path = "output.mp4"
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    vertical_center = width // 2
    analyzed_objects = {}  # track_id -> Gemini response (each object analyzed once)
    log_messages = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        results = yolo_model.track(frame, persist=True)
        if results and results[0].boxes is not None and results[0].boxes.xyxy is not None:
            boxes = results[0].boxes.xyxy.int().cpu().tolist()
            class_ids = results[0].boxes.cls.int().cpu().tolist()
            # Tracking ids can be absent on the first frames; use -1 placeholders.
            track_ids = (results[0].boxes.id.int().cpu().tolist()
                         if results[0].boxes.id is not None
                         else [-1] * len(boxes))
            for box, track_id, class_id in zip(boxes, track_ids, class_ids):
                x1, y1, x2, y2 = box
                center_x = (x1 + x2) // 2
                # ✅ Only annotate after the bottle reaches the left half of the frame.
                if center_x > vertical_center:
                    continue

                # BUG FIX: crop the clean frame BEFORE drawing overlays, so the
                # image sent to Gemini is not polluted by our own box and text
                # (the original code cropped after drawing on the frame).
                if track_id not in analyzed_objects:
                    crop = frame[y1:y2, x1:x2]
                    response = analyze_image_with_gemini(crop)
                    analyzed_objects[track_id] = response
                    log_messages.append(f"Object {track_id}: {response}")  # ✅ Add log
                    print(f"Object {track_id}: {response}")  # ✅ Print log for debugging

                # Draw detection box, track id, and class name.
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cvzone.putTextRect(frame, f'ID: {track_id}', (x2, y2), 1, 1)
                cvzone.putTextRect(frame, f'{names[class_id]}', (x1, y1), 1, 1)

                # 🛠️ Keep the analysis text on screen for each analyzed object.
                # (The original re-checked membership here, but the dict entry is
                # guaranteed to exist by this point.)
                response_text = analyzed_objects[track_id]
                text_x = 50  # Left side
                text_y = height // 2  # Middle of the frame
                cvzone.putTextRect(frame, response_text, (text_x, text_y), 2, 2,
                                   colorT=(255, 255, 255), colorR=(0, 0, 255))
        out.write(frame)

    cap.release()
    out.release()
    # ✅ Return logs along with the processed video.
    return output_video_path, "\n".join(log_messages)
def gradio_interface(video_path):
    """Gradio entry point: validate the upload, then delegate to process_video."""
    # No file supplied — report the error in both output slots.
    if video_path is None:
        return "Error: No video uploaded.", ""
    return process_video(video_path)
# ✅ Sample video file — make sure this file is available in the working directory.
sample_video_path = "vid4.mp4"

# ✅ Gradio UI setup with the sample video preloaded.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(
        # Robustness: only preload the sample when it actually exists —
        # gr.File raises at startup on a missing default path.
        value=sample_video_path if os.path.exists(sample_video_path) else None,
        type="filepath",
        label="Upload Video (Sample Included)",
    ),
    outputs=[
        gr.Video(label="Processed Video"),
        gr.Textbox(label="Processing Logs", lines=10, interactive=False),
    ],
    title="YOLO + Gemini AI Video Analysis",
    description="Upload a video to detect objects and analyze them using Gemini AI.\nA sample video is preloaded for quick testing.",
)

if __name__ == "__main__":
    iface.launch(share=True)