# Nutrition_App / multimodal_queries.py
# Author: Kilos1 — Hugging Face Space file (commit d7dbc2c, "Update multimodal_queries.py")
import requests
import base64
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
# Load the Hugging Face model and tokenizer once at import time so every
# request reuses the same weights.
# Fixed: Hub repo ids must match exactly; the original
# "meta-llama/llama-3-2-90b-vision-instruct" is not a valid repo name, so
# from_pretrained raised at startup. The published repo is:
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): Llama 3.2 Vision is an Mllama model; the documented loading
# class is MllamaForConditionalGeneration / AutoModelForVision2Seq, and
# AutoModelForCausalLM may refuse this checkpoint — confirm against the
# model card before deploying.
model = AutoModelForCausalLM.from_pretrained(model_id)
def input_image_setup(uploaded_file):
    """
    Return the uploaded image encoded as a base64 string.

    Parameters:
    - uploaded_file: File-like object uploaded via Gradio.

    Returns:
    - str: Base64-encoded string of the image data.

    Raises:
    - FileNotFoundError: If no file was provided.
    """
    # Guard clause: fail fast when the upload is missing.
    if uploaded_file is None:
        raise FileNotFoundError("No file uploaded")
    raw_bytes = uploaded_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
def generate_model_response(encoded_image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
    """
    Sends an image and a query to the model and retrieves the description or answer.

    Parameters:
    - encoded_image (str): Base64-encoded image string.
    - user_query (str): The user's question about the image.
    - assistant_prompt (str): Optional prompt to guide the model's response.

    Returns:
    - str: The model's answer only (prompt text is stripped).
    """
    # NOTE(review): vision models normally require a processor / chat template
    # rather than a markdown data-URI inside plain text — confirm this prompt
    # format actually reaches the vision tower.
    input_text = assistant_prompt + user_query + "\n![Image](data:image/jpeg;base64," + encoded_image + ")"
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt")
    # Bound the generation length explicitly; the transformers default
    # (max_length=20 total) would truncate almost any answer.
    outputs = model.generate(**inputs, max_new_tokens=128)
    # Fixed: generate() returns prompt tokens + continuation, so decoding
    # outputs[0] whole echoed the user's own prompt back at them. Slice off
    # the prompt before decoding so only the model's answer is returned.
    prompt_len = inputs["input_ids"].shape[-1]
    response_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return response_text
def process_image_and_query(uploaded_file, user_query):
    """
    Run the full pipeline: encode the uploaded image, then query the model.

    Parameters:
    - uploaded_file: The uploaded image file.
    - user_query: The user's question about the image.

    Returns:
    - str: The generated response from the model.
    """
    # Encode first, then hand both pieces to the model in one expression.
    return generate_model_response(input_image_setup(uploaded_file), user_query)
# Create Gradio interface.
# Fixed: the gr.inputs.* namespace (and Image(type="file")) was deprecated in
# Gradio 3.x and removed in 4.x, so this app crashed with AttributeError on
# launch. Modern gr.Image yields a filepath (no "file" type exists anymore),
# so a thin adapter re-opens the path as the file-like object the existing
# pipeline expects — process_image_and_query's contract is unchanged.
def _handle_request(image_path, user_query):
    """Adapter: open the Gradio-supplied filepath and run the pipeline."""
    with open(image_path, "rb") as image_file:
        return process_image_and_query(image_file, user_query)

iface = gr.Interface(
    fn=_handle_request,
    inputs=[
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(label="User Query", placeholder="Enter your question about the image..."),
    ],
    outputs="text",
)
# Launch the Gradio app (kept unconditional: HF Spaces execute this file directly).
iface.launch()