# Nutrition_App / multimodal_queries.py
# Author: Kilos1 — Hugging Face Space file (commit d7dbc2c, "Update multimodal_queries.py")
import requests
import base64
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
# Load the Hugging Face model and tokenizer once at import time so every
# request reuses the same weights.
# Fixed: Hub repo ids must match exactly; the original
# "meta-llama/llama-3-2-90b-vision-instruct" is not a valid repo name, so
# from_pretrained raised at startup. The published repo is:
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): Llama 3.2 Vision is an Mllama model; the documented loading
# class is MllamaForConditionalGeneration / AutoModelForVision2Seq, and
# AutoModelForCausalLM may refuse this checkpoint — confirm against the
# model card before deploying.
model = AutoModelForCausalLM.from_pretrained(model_id)
def input_image_setup(uploaded_file):
    """
    Return the uploaded image encoded as a base64 string.

    Parameters:
    - uploaded_file: File-like object uploaded via Gradio.

    Returns:
    - str: Base64-encoded string of the image data.

    Raises:
    - FileNotFoundError: If no file was provided.
    """
    # Guard clause: fail fast when the upload is missing.
    if uploaded_file is None:
        raise FileNotFoundError("No file uploaded")
    raw_bytes = uploaded_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
def generate_model_response(encoded_image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
    """
    Sends an image and a query to the model and retrieves the description or answer.

    Parameters:
    - encoded_image (str): Base64-encoded image string.
    - user_query (str): The user's question about the image.
    - assistant_prompt (str): Optional prompt to guide the model's response.

    Returns:
    - str: The model's answer only (prompt text is stripped).
    """
    # NOTE(review): vision models normally require a processor / chat template
    # rather than a markdown data-URI inside plain text — confirm this prompt
    # format actually reaches the vision tower.
    input_text = assistant_prompt + user_query + "\n![Image](data:image/jpeg;base64," + encoded_image + ")"
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt")
    # Bound the generation length explicitly; the transformers default
    # (max_length=20 total) would truncate almost any answer.
    outputs = model.generate(**inputs, max_new_tokens=128)
    # Fixed: generate() returns prompt tokens + continuation, so decoding
    # outputs[0] whole echoed the user's own prompt back at them. Slice off
    # the prompt before decoding so only the model's answer is returned.
    prompt_len = inputs["input_ids"].shape[-1]
    response_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return response_text
def process_image_and_query(uploaded_file, user_query):
    """
    Run the full pipeline: encode the uploaded image, then query the model.

    Parameters:
    - uploaded_file: The uploaded image file.
    - user_query: The user's question about the image.

    Returns:
    - str: The generated response from the model.
    """
    # Encode first, then hand both pieces to the model in one expression.
    return generate_model_response(input_image_setup(uploaded_file), user_query)
# Create Gradio interface.
# Fixed: the gr.inputs.* namespace (and Image(type="file")) was deprecated in
# Gradio 3.x and removed in 4.x, so this app crashed with AttributeError on
# launch. Modern gr.Image yields a filepath (no "file" type exists anymore),
# so a thin adapter re-opens the path as the file-like object the existing
# pipeline expects — process_image_and_query's contract is unchanged.
def _handle_request(image_path, user_query):
    """Adapter: open the Gradio-supplied filepath and run the pipeline."""
    with open(image_path, "rb") as image_file:
        return process_image_and_query(image_file, user_query)

iface = gr.Interface(
    fn=_handle_request,
    inputs=[
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(label="User Query", placeholder="Enter your question about the image..."),
    ],
    outputs="text",
)
# Launch the Gradio app (kept unconditional: HF Spaces execute this file directly).
iface.launch()