import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import numpy as np
# Load the vision-language model in float32 on CPU
model = AutoModelForVision2Seq.from_pretrained(
    "./SmolVLM-500M-Instruct",
    torch_dtype=torch.float32,
    _attn_implementation="eager",
    device_map="cpu",
)
processor = AutoProcessor.from_pretrained("./SmolVLM-500M-Instruct")
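
# Sketch (an assumption, not part of the original Space): on a machine with a
# GPU, the model could instead be loaded in half precision for speed, e.g.:
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = AutoModelForVision2Seq.from_pretrained(
#       "./SmolVLM-500M-Instruct",
#       torch_dtype=torch.float16 if device == "cuda" else torch.float32,
#       device_map=device,
#   )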
def array_to_image(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert the numpy array from Gradio into an RGB PIL Image
    image = Image.fromarray(np.uint8(image_array)).convert("RGB")
    return image
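
# Illustrative example (assumes Gradio's default numpy image format, H x W x 3 uint8):
#   array_to_image(np.zeros((64, 64, 3), dtype=np.uint8))  # -> 64x64 black RGB PIL image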
# Load the sentence-embedding model once at import time instead of on every request
embedding_model = SentenceTransformer('./all-MiniLM-L6-v2')

def generate_embeddings(text):
    embeddings = embedding_model.encode(text)
    return embeddings.tolist()  # numpy array -> JSON-serializable list for gr.JSON
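
# Illustrative example: all-MiniLM-L6-v2 produces 384-dimensional sentence
# embeddings, so this returns a list of 384 floats:
#   vec = generate_embeddings("a red apple on a wooden table")
#   assert len(vec) == 384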
def describe_image(image_array):
    image = array_to_image(image_array)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Make a very detailed description of the image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    # Inference: greedy decoding of the description
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=500,
            num_beams=1,      # disable beam search
            do_sample=False,  # disable sampling (deterministic output)
        )
    # Strip the prompt tokens so only the newly generated tokens are decoded
    output_ids = [
        generated[len(prompt_ids):]
        for prompt_ids, generated in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    # batch_decode returns a list; take the single description string
    description = output_text[0]
    return description, generate_embeddings(description)
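
# Local smoke test (a sketch; "example.jpg" is a placeholder path, not from the Space):
#   img = np.asarray(Image.open("example.jpg").convert("RGB"))
#   description, embeddings = describe_image(img)
#   print(description)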
# Create the Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(),
    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
    title="Image Description with SmolVLM-500M-Instruct and Text Embeddings with all-MiniLM-L6-v2",
    description="Upload an image to get a detailed description (SmolVLM-500M-Instruct) and an embedding of that description (all-MiniLM-L6-v2).",
)
# Launch the app
#iface.launch(share=True)
iface.launch()
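
# For local debugging, standard Gradio options such as iface.launch(debug=True)
# or iface.launch(server_name="0.0.0.0", server_port=7860) could be used instead
# (both are documented Gradio parameters, not part of the original Space).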