import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from datetime import datetime
import numpy as np
import os

# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "./Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("./Qwen2-VL-7B-Instruct")

# Load the sentence-embedding model once at startup instead of on every request
embedding_model = SentenceTransformer("./all-MiniLM-L6-v2")


def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")

    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))

    # Generate a unique filename using a timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"

    # Save the image
    img.save(filename)

    # Return the full path of the saved image
    return os.path.abspath(filename)


def generate_embeddings(text):
    # Encode the text and convert to a plain list so it is JSON-serializable
    embeddings = embedding_model.encode(text)
    return embeddings.tolist()


def describe_image(image):
    # Save the uploaded array to disk and reload it as a PIL image for the processor
    image_path = array_to_image_path(image)
    pil_image = Image.open(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Make a very detailed description of the image."},
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Expected prompt: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Make a very detailed description of the image.<|im_end|>\n<|im_start|>assistant\n'

    inputs = processor(
        text=[text_prompt], images=[pil_image], padding=True, return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    output_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    # Remove the temporary image file
    os.remove(image_path)

    # Return the detailed description and its sentence embedding
    return output_text, generate_embeddings(output_text)


# Create a Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(),
    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
    title="Image Description with Qwen Model",
    description="Upload an image to get a detailed description using the Qwen2-VL-7B-Instruct model.",
)

# Launch the app
# iface.launch(share=True)
iface.launch()