Nutrition_App / multimodal_queries.py
Kilos1's picture
Update multimodal_queries.py
612c5f5 verified
raw
history blame
2.27 kB
import base64
import html
import io
import re

import gradio as gr
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
# Checkpoint identifier shared by the processor and the model below.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Processor built from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id)

# Weights are requested in bfloat16; device placement is left to the
# loader via device_map="auto".
model = MllamaForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
def _html_paragraph(text):
    """Return *text* wrapped in ``<p>`` with HTML escaped and newlines kept."""
    # The result is rendered by an HTML component, so any markup characters in
    # model output or exception text must be escaped to display literally.
    return "<p>" + html.escape(text).replace("\n", "<br>") + "</p>"


def generate_model_response(image_file, user_query, max_new_tokens=512):
    """
    Answer a question about an uploaded image with the vision-language model.

    Parameters:
    - image_file: Path or file-like object accepted by ``PIL.Image.open``;
      ``None`` when nothing was uploaded.
    - user_query: The user's question about the image.
    - max_new_tokens: Upper bound on the number of tokens generated for the
      answer (default 512).

    Returns:
    - str: An HTML ``<p>`` fragment holding either the model's answer, a
      prompt to supply missing input, or an error message. Exceptions are
      caught, printed, and reported in the returned HTML rather than raised.
    """
    # Give a clear message for missing input instead of failing inside PIL
    # or the model.
    if image_file is None:
        return _html_paragraph("Please upload an image.")
    question = (user_query or "").strip()
    if not question:
        return _html_paragraph("Please enter a question about the image.")

    try:
        # Close the underlying file once the RGB copy has been made.
        with Image.open(image_file) as opened_image:
            raw_image = opened_image.convert("RGB")

        # One user turn: an image slot followed by the question text.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question},
                ],
            }
        ]

        # Render the chat template to a prompt string (no tokenisation yet).
        prompt = processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=False
        )

        # Keyword arguments avoid depending on the processor's positional
        # order (images come first, then text). The template already emitted
        # the special tokens, so they must not be added a second time.
        model_inputs = processor(
            images=raw_image,
            text=prompt,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

        # Without an explicit budget, generation falls back to a very small
        # default length and truncates the answer.
        outputs = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

        # The output sequence starts with the prompt tokens; decode only the
        # newly generated continuation.
        prompt_length = model_inputs["input_ids"].shape[-1]
        generated_text = processor.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        return _html_paragraph(generated_text.strip())
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        # the request handler.
        print(f"Error in generating response: {e}")
        return _html_paragraph(f"An error occurred: {e}")
# Gradio interface: an image upload plus a question box, answered as HTML.
iface = gr.Interface(
    fn=generate_model_response,
    inputs=[
        # "filepath" passes the callback a path on disk, which PIL can open
        # directly; "file" is not an accepted Image type in current Gradio.
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(
            label="Enter your question",
            placeholder="How many calories are in this food?",
        ),
    ],
    outputs=gr.HTML(label="Response from Model"),
)

# Start the web server; share=True additionally requests a public share link.
iface.launch(share=True)