import base64
import io

import gradio as gr
import torch
from PIL import Image
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# Load the OWLv2 zero-shot object detection model and its processor
processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-finetuned")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-finetuned")


def input_image_setup(uploaded_file):
    """
    Encodes the uploaded image file into a base64 string.

    Parameters:
    - uploaded_file (str): Path to the image file uploaded via Gradio.

    Returns:
    - encoded_image (str): Base64-encoded string of the image data.
    """
    if uploaded_file is not None:
        with open(uploaded_file, "rb") as f:
            bytes_data = f.read()
        encoded_image = base64.b64encode(bytes_data).decode("utf-8")
        return encoded_image
    else:
        raise FileNotFoundError("No file uploaded")


def generate_model_response(encoded_image, user_query, score_threshold=0.3):
    """
    Runs zero-shot object detection on the image, using the user's query as the
    text prompt, and returns a short textual answer.

    Parameters:
    - encoded_image (str): Base64-encoded image string.
    - user_query (str): The user's question about the image, used as the detection query.
    - score_threshold (float): Minimum confidence score for a detection to be reported.

    Returns:
    - str: The model's response for the given image and query.
    """
    # Decode the base64 string back into a PIL image
    image_bytes = base64.b64decode(encoded_image)
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    # OWLv2 expects one list of text queries per image
    inputs = processor(text=[[user_query]], images=image, return_tensors="pt")

    # Run detection without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert raw outputs into boxes and scores in the original image coordinates
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs=outputs, target_sizes=target_sizes, threshold=score_threshold
    )[0]

    scores = results["scores"]
    if len(scores) == 0:
        return f'Nothing matching "{user_query}" was detected in the image.'
    best_score = scores.max().item()
    return (
        f'Detected {len(scores)} region(s) matching "{user_query}" '
        f"(best confidence: {best_score:.2f})."
    )


def process_image_and_query(uploaded_file, user_query):
    """
    Processes the uploaded image and user query and generates a response from the model.

    Parameters:
    - uploaded_file (str): Path to the uploaded image file.
    - user_query (str): The user's question about the image.

    Returns:
    - str: The generated response from the model.
    """
    # Encode the uploaded image
    encoded_image = input_image_setup(uploaded_file)

    # Generate a response using the encoded image and user query
    response = generate_model_response(encoded_image, user_query)
    return response


# Create the Gradio interface
iface = gr.Interface(
    fn=process_image_and_query,
    inputs=[
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(label="User Query", placeholder="Enter your question about the image..."),
    ],
    outputs="text",
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
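

# --- Optional: free-form answers with a visual question answering model ---
# The original prompt template ("Answer the following user query in 1 or 2
# sentences") suggests the author also wanted free-form answers rather than a
# detection summary. The sketch below shows one way to do that, assuming the
# BLIP VQA checkpoint "Salesforce/blip-vqa-base" from transformers; the model
# choice and the helper name generate_vqa_response are illustrative
# assumptions, not part of the original code, and the function is not wired
# into the Gradio interface above.
def generate_vqa_response(image_path, user_query):
    """Answer a question about an image with a BLIP visual question answering model."""
    from transformers import BlipProcessor, BlipForQuestionAnswering

    # Load lazily so the OWLv2 app above does not pay for the extra model.
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(image, user_query, return_tensors="pt")
    output_ids = blip_model.generate(**inputs)
    return blip_processor.decode(output_ids[0], skip_special_tokens=True)


# Example usage (hypothetical file name):
# print(generate_vqa_response("example.jpg", "What color is the car?"))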