Spaces:

Geraldine
/

Image-to-text-SmolVLM-for-Omeka

Sleeping

App Files Files Community

Geraldine committed on Jan 19

Commit

2ba0248

verified ·

1 Parent(s): 4e2506f

Create app.py

Browse files

Files changed (1) hide show

app.py +91 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import gradio as gr
+from sentence_transformers import SentenceTransformer
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from PIL import Image
+import torch
+from torchvision import io
+from typing import Dict
+from datetime import datetime
+import numpy as np
+import base64
+import os, io
+# Load the model in half-precision on the available device(s)
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    "./Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+)
+processor = AutoProcessor.from_pretrained("./Qwen2-VL-7B-Instruct")
+def array_to_image_path(image_array):
+    if image_array is None:
+        raise ValueError("No image provided. Please upload an image before submitting.")
+    # Convert numpy array to PIL Image
+    img = Image.fromarray(np.uint8(image_array))
+    # Generate a unique filename using timestamp
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"image_{timestamp}.png"
+    # Save the image
+    img.save(filename)
+    # Get the full path of the saved image
+    full_path = os.path.abspath(filename)
+    return full_path
+def generate_embeddings(text):
+    model = SentenceTransformer('./all-MiniLM-L6-v2')
+    embeddings = model.encode(sentences)
+    return embeddings
+def describe_image(image):
+    # Convert the image to the format expected by the model
+    image_path = array_to_image_path(image)
+    with open(image_path, "rb") as f:
+        image = base64.b64encode(f.read()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"data:image/png;base64,{image}"},
+                {"type": "text", "text": "Make a very detailed description of the image."},
+            ],
+        }
+    ]
+    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    # Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
+    inputs = processor(
+        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
+    )
+    inputs = inputs.to("cuda")
+    # Inference: Generation of the output
+    output_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids = [
+        output_ids[len(input_ids) :]
+        for input_ids, output_ids in zip(inputs.input_ids, output_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+    )
+    # remove image
+    os.remove(image_path)
+    # Extract the detailed description from the response
+    return output_text, generate_embeddings(output_text)
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=describe_image,
+    inputs=gr.Image(),
+    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
+    title="Image Description with Qwen Model",
+    description="Upload an image to get a detailed description using the Qwen2-VL-7B-Instruct model."
+)
+# Launch the app
+#iface.launch(share=True)
+iface.launch()