Spaces:

Geraldine
/

Image-to-text-SmolVLM-for-Omeka

Sleeping

App Files Files Community

Image-to-text-SmolVLM-for-Omeka / app.py

Geraldine

Update app.py

398493c verified about 1 month ago

raw

history blame contribute delete

2.69 kB

	import gradio as gr
	from sentence_transformers import SentenceTransformer
	from transformers import AutoProcessor, AutoModelForVision2Seq
	from PIL import Image
	import torch
	from torchvision import io
	from typing import Dict
	from datetime import datetime
	import numpy as np
	import base64
	import os, stat, io

	# Directory holding the local SmolVLM checkpoint, shared by model and processor.
	SMOLVLM_PATH = "./SmolVLM-500M-Instruct"

	# Load the model once at import time, in full precision (float32) on the CPU,
	# requesting the "eager" attention implementation.
	model = AutoModelForVision2Seq.from_pretrained(
	    SMOLVLM_PATH,
	    torch_dtype=torch.float32,
	    _attn_implementation="eager",
	    device_map="cpu",
	)
	# Processor loaded from the same checkpoint directory as the model.
	processor = AutoProcessor.from_pretrained(SMOLVLM_PATH)

	def array_to_image(image_array):
	    """Turn the array supplied by the image input into an RGB PIL image.

	    Args:
	        image_array: Pixel data for the uploaded image, or ``None`` when
	            nothing was uploaded.

	    Returns:
	        A ``PIL.Image.Image`` in RGB mode built from the uint8-cast array.

	    Raises:
	        ValueError: If ``image_array`` is ``None``.
	    """
	    if image_array is not None:
	        # Cast to 8-bit before handing the pixels to PIL, then force RGB.
	        pixels = np.uint8(image_array)
	        return Image.fromarray(pixels).convert("RGB")

	    raise ValueError("No image provided. Please upload an image before submitting.")

	# Lazily created SentenceTransformer, shared by every call to generate_embeddings.
	_embedding_model = None


	def generate_embeddings(text):
	    """Encode text with the local all-MiniLM-L6-v2 sentence-transformer.

	    The encoder is loaded on first use and kept in a module-level variable so
	    that repeated requests do not reload the model from disk each time.

	    Args:
	        text: The input passed straight to ``SentenceTransformer.encode``.

	    Returns:
	        Whatever ``SentenceTransformer.encode`` returns for ``text``.
	    """
	    global _embedding_model
	    if _embedding_model is None:
	        _embedding_model = SentenceTransformer('./all-MiniLM-L6-v2')
	    return _embedding_model.encode(text)

	def describe_image(image_array):
	    """Describe an uploaded image with SmolVLM and embed the description.

	    Args:
	        image_array: Image data as delivered by the ``gr.Image`` input; it is
	            converted to an RGB PIL image by ``array_to_image``.

	    Returns:
	        A ``(description, embeddings)`` pair. ``description`` is the generated
	        text as a plain string; ``embeddings`` is the result of
	        ``generate_embeddings`` for the one-item list of decoded text.

	    Raises:
	        ValueError: If no image was provided (raised by ``array_to_image``).
	    """
	    image = array_to_image(image_array)

	    # Single-turn chat: an image placeholder followed by the instruction.
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": "Make a very detailed description of the image."},
	            ],
	        }
	    ]
	    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	    inputs = processor(text=prompt, images=[image], return_tensors="pt")

	    # Greedy decoding: beam search and sampling are both switched off.
	    with torch.no_grad():
	        generated_ids = model.generate(
	            **inputs,
	            max_new_tokens=500,
	            num_beams=1,
	            do_sample=False,
	        )

	    # Strip the prompt tokens from each generated sequence so that only the
	    # newly generated tokens are decoded.
	    new_token_ids = [
	        sequence[len(prompt_ids):]
	        for prompt_ids, sequence in zip(inputs.input_ids, generated_ids)
	    ]
	    decoded = processor.batch_decode(
	        new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
	    )

	    # batch_decode returns a list; return the single string itself so the
	    # Textbox shows the description rather than the repr of a list. The
	    # embeddings are still computed on the one-item list, as before.
	    description = decoded[0]
	    return description, generate_embeddings(decoded)

	# Gradio UI: one image input; the description text and its embeddings as outputs.
	iface = gr.Interface(
	    title="Image Description with SmolVLM-500M-Instruct and Textual embeddings with all-MiniLM-L6-v2",
	    description="Upload an image to get a detailed description using the SmolVLM-500M-Instruct model.",
	    fn=describe_image,
	    inputs=gr.Image(),
	    outputs=[
	        gr.Textbox(label="Description"),
	        gr.JSON(label="Embeddings"),
	    ],
	)

	# Start the web server only when this file is run as a script, so importing
	# the module (e.g. to reuse describe_image) does not launch the app.
	if __name__ == "__main__":
	    iface.launch()