import gradio as gr
from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, VisionEncoderDecoderModel
import torch
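# Download two example images: a COCO val2017 photo and a TextCaps stop-sign sample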
torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
torch.hub.download_url_to_file('https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png', 'stop_sign.png')
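# GIT (GenerativeImage2Text) from Microsoft, fine-tuned for captioning on COCO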
git_processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")
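# BLIP from Salesforce, base image-captioning checkpoint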
blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
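# ViT encoder + GPT-2 decoder; its image processor only produces pixel values,
# so a separate tokenizer is needed to decode the generated token ids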
vitgpt_processor = AutoImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vitgpt_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vitgpt_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
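# Move all three models to the GPU if one is available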
device = "cuda" if torch.cuda.is_available() else "cpu"
git_model.to(device)
blip_model.to(device)
vitgpt_model.to(device)
def generate_caption(processor, model, image, tokenizer=None):
    # Preprocess the image and move the tensors to the model's device
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
    # GIT and BLIP processors can decode directly; ViT+GPT-2 passes its own tokenizer
    if tokenizer is not None:
        generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    else:
        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption
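# Run all three models on the same image so the captions can be compared side by side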
def generate_captions(image):
    caption_git = generate_caption(git_processor, git_model, image)
    caption_blip = generate_caption(blip_processor, blip_model, image)
    caption_vitgpt = generate_caption(vitgpt_processor, vitgpt_model, image, vitgpt_tokenizer)
    return caption_git, caption_blip, caption_vitgpt
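# Example inputs (downloaded above) and the text shown in the interface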
examples = [["cats.jpg"], ["stop_sign.png"]]
title = "Interactive demo: comparing image captioning models"
description = "Gradio Demo to compare GIT, BLIP and ViT-2-GPT2, 3 state-of-the-art captioning models. To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2102.03334' target='_blank'>ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision</a> | <a href='https://github.com/dandelin/ViLT' target='_blank'>Github Repo</a></p>"
interface = gr.Interface(fn=generate_captions,
                         inputs=gr.Image(type="pil"),
                         outputs=[gr.Textbox(label="Caption generated by GIT"),
                                  gr.Textbox(label="Caption generated by BLIP"),
                                  gr.Textbox(label="Caption generated by ViT+GPT-2")],
                         examples=examples,
                         title=title,
                         description=description,
                         article=article)
interface.queue().launch(debug=True)