import logging

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

logger = logging.getLogger(__name__)

# Checkpoint to serve and the device/dtype it is loaded with.
model_id = "OpenGVLab/InternVL2_5-78B"
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DTYPE = torch.bfloat16

# Upper bound on generated tokens. Without an explicit limit, generate()
# falls back to the generation config's default length, which can truncate
# answers after a handful of tokens.
MAX_NEW_TOKENS = 512

# Load the model once at start-up, in eval mode, on the selected device.
# NOTE(review): a 78B-parameter checkpoint in bfloat16 is unlikely to fit on
# a single accelerator; `.to(device)` may need to become a multi-GPU
# `device_map` -- confirm against the target hardware.
model = AutoModel.from_pretrained(
    model_id,
    torch_dtype=MODEL_DTYPE,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
).eval().to(device)

# The model is loaded with trust_remote_code=True; load the processor from
# the same repository with the same setting so both resolve consistently.
# NOTE(review): confirm this checkpoint actually ships a processor usable
# through AutoProcessor (its remote code may expect a tokenizer-based API).
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


def generate_model_response(image_file, user_query) -> str:
    """Answer a question about an uploaded image using the loaded model.

    Parameters:
    - image_file: Path to (or open file object of) the uploaded image;
      anything ``PIL.Image.open`` accepts. ``None`` when nothing was uploaded.
    - user_query: The user's question about the image.

    Returns:
    - str: The decoded model output, a short prompt asking for the missing
      input, or ``"An error occurred: ..."`` if processing fails.
    """
    # Gradio passes None when the user submits without uploading an image.
    if image_file is None:
        return "Please upload an image."
    if not user_query or not user_query.strip():
        return "Please enter a question about the image."

    try:
        # Open inside a context manager so the underlying file handle is
        # released; convert() returns an independent, fully loaded copy.
        with Image.open(image_file) as opened_image:
            raw_image = opened_image.convert("RGB")

        # Tokenize the question and preprocess the image, then move the
        # resulting tensors to the model's device.
        inputs = processor(
            images=raw_image, text=user_query, return_tensors="pt"
        ).to(device)

        # The model weights are bfloat16: cast floating-point inputs to the
        # same dtype to avoid a dtype mismatch, and leave integer tensors
        # (e.g. token ids) untouched.
        model_inputs = {
            name: (
                value.to(MODEL_DTYPE)
                if isinstance(value, torch.Tensor) and value.is_floating_point()
                else value
            )
            for name, value in inputs.items()
        }

        outputs = model.generate(**model_inputs, max_new_tokens=MAX_NEW_TOKENS)

        # Decode the first (only) sequence in the batch.
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:  # UI boundary: report the failure, don't crash.
        logger.exception("Error in generating response")
        return f"An error occurred: {str(e)}"


# Gradio Interface
iface = gr.Interface(
    fn=generate_model_response,
    inputs=[
        # "filepath" hands the callback a path to a temporary file; the old
        # "file" value is deprecated and not accepted by current Gradio.
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(
            label="Enter your question",
            placeholder="What do you want to know about this image?",
        ),
    ],
    outputs="text",
)

iface.launch(share=True)