import torch
import gradio as gr
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Load the model and processor
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


def generate_model_response(image_file, user_query):
    """
    Processes the uploaded image and user query to generate a response from the model.

    Parameters:
    - image_file: Path to the uploaded image file.
    - user_query: The user's question about the image.

    Returns:
    - str: The generated response from the model (rendered by the gr.HTML output).
    """
    try:
        # Load and prepare the image
        raw_image = Image.open(image_file).convert("RGB")

        # Build the conversation; the chat template inserts the <|image|>
        # token for the image content block, so no placeholder URL is needed
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": user_query},
                ],
            }
        ]

        # Apply the chat template to render the conversation as a prompt string
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Process the image and text inputs together
        inputs = processor(raw_image, prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

        # Generate a response from the model
        outputs = model.generate(**inputs, max_new_tokens=256)

        # Decode only the newly generated tokens, skipping the echoed prompt
        generated_text = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        return generated_text
    except Exception as e:
        print(f"Error in generating response: {e}")
        return f"<p>An error occurred: {e}</p>"
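
# Optional: a streaming variant of the handler above, so the UI shows tokens
# as they are produced instead of waiting for the full answer. This is a
# minimal sketch, not part of the original app: it reuses the `model` and
# `processor` defined above and relies on transformers' TextIteratorStreamer
# and Gradio's support for generator functions. To try it, pass this function
# as fn= to gr.Interface below instead of generate_model_response.
from threading import Thread

from transformers import TextIteratorStreamer


def generate_model_response_streaming(image_file, user_query):
    raw_image = Image.open(image_file).convert("RGB")
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_query},
            ],
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(raw_image, prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

    # skip_prompt=True drops the echoed input; the streamer yields decoded text chunks
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so run it on a background thread while we consume the streamer
    thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256))
    thread.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # Gradio re-renders the output with each yielded value
    thread.join()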

" # Gradio Interface iface = gr.Interface( fn=generate_model_response, inputs=[ gr.Image(type="file", label="Upload Image"), gr.Textbox(label="Enter your question", placeholder="How many calories are in this food?") ], outputs=gr.HTML(label="Response from Model"), ) iface.launch(share=True)