# Nutrition_App / multimodal_queries.py
import base64

import gradio as gr
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, Owlv2Processor, Owlv2ForObjectDetection

# Open-vocabulary detector for spotting food items in images.
processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-finetuned")
detection_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-finetuned")

# Language model and tokenizer used by generate_model_response().
# NOTE: "gpt2" is a placeholder assumption; swap in the chat/multimodal model this app targets.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
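
# --- Optional sketch: using the OWLv2 detector loaded above -------------------
# The detector is loaded but not wired into the query flow; the helper below is a
# minimal, assumed example of how it could flag food items in an uploaded image.
# The label list and confidence threshold are illustrative placeholders, not values
# taken from the original app.
import torch

def detect_food_items(image: Image.Image, labels=("a photo of an apple", "a photo of a banana")):
    """Return (score, label, box) tuples for detections above the confidence threshold."""
    inputs = processor(text=[list(labels)], images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = detection_model(**inputs)
    # target_sizes expects (height, width) for each image in the batch
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs=outputs, target_sizes=target_sizes, threshold=0.2
    )[0]
    return [
        (score.item(), labels[label.item()], box.tolist())
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"])
    ]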
def input_image_setup(uploaded_file):
    """
    Encodes the uploaded image file into a base64 string.

    Parameters:
    - uploaded_file: Path to the uploaded image (as passed by Gradio) or a file-like object.

    Returns:
    - encoded_image (str): Base64-encoded string of the image data.
    """
    if uploaded_file is None:
        raise FileNotFoundError("No file uploaded")

    # Accept either an open file-like object or a filesystem path.
    if hasattr(uploaded_file, "read"):
        bytes_data = uploaded_file.read()
    else:
        with open(uploaded_file, "rb") as image_file:
            bytes_data = image_file.read()

    encoded_image = base64.b64encode(bytes_data).decode("utf-8")
    return encoded_image
def generate_model_response(encoded_image, user_query,
                            assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
    """
    Sends an image and a query to the model and retrieves the description or answer.

    Parameters:
    - encoded_image (str): Base64-encoded image string.
    - user_query (str): The user's question about the image.
    - assistant_prompt (str): Optional prompt prepended to guide the model's response.

    Returns:
    - str: The model's response for the given image and query.
    """
    # Build the prompt; the image is passed inline as a Markdown data URI, which assumes
    # the underlying model can interpret images embedded in the text prompt.
    input_text = (
        assistant_prompt
        + user_query
        + "\n![Image](data:image/jpeg;base64,"
        + encoded_image
        + ")"
    )

    # Tokenize the prompt
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generate a response; cap new tokens so generation terminates promptly
    outputs = model.generate(**inputs, max_new_tokens=128)

    # Decode and return the model's text response
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response_text
def process_image_and_query(uploaded_file, user_query):
    """
    Processes the uploaded image and user query to generate a response from the model.

    Parameters:
    - uploaded_file: The uploaded image file.
    - user_query: The user's question about the image.

    Returns:
    - str: The generated response from the model.
    """
    # Encode the uploaded image
    encoded_image = input_image_setup(uploaded_file)

    # Generate a response using the encoded image and user query
    response = generate_model_response(encoded_image, user_query)
    return response
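
# Example (sketch): calling the pipeline directly, outside the Gradio UI.
# "meal.jpg" is a hypothetical local file used purely for illustration.
#   answer = process_image_and_query("meal.jpg", "Roughly how many calories are on this plate?")
#   print(answer)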
# Create the Gradio interface
iface = gr.Interface(
    fn=process_image_and_query,
    inputs=[
        # type="filepath" hands the uploaded image to the callback as a file path
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(label="User Query", placeholder="Enter your question about the image..."),
    ],
    outputs="text",
)

# Launch the Gradio app
iface.launch()
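
# Note: when running outside Hugging Face Spaces, iface.launch(share=True) creates a
# temporary public link; on Spaces the plain launch() above is sufficient.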