Spaces:

ashish-001
/

ViT-BART-Based-Image-Captioning

Running

File size: 1,930 Bytes

906f611

import gradio as gr
from model_architecture import ImageCaptionGenerationWithAttention
from transformers import BartForConditionalGeneration, BartTokenizer, ViTModel, ViTImageProcessor
import torch
from PIL import Image
from dotenv import load_dotenv
import os
import traceback

# Merge variables from a local .env file (when one exists) into the process
# environment before the token is looked up.
load_dotenv()

# Hugging Face access token taken from the `hf_token` environment variable;
# this is None when the variable is not set.
HF_TOKEN = os.environ.get('hf_token')


class GenerateCaptions:
    """Caption images with a ViT encoder and a BART decoder.

    The two pretrained backbones are wrapped by
    ``ImageCaptionGenerationWithAttention`` and the trained weights are
    restored from ``image_captioning_model_state_dict.pt``. Everything is
    loaded once, in ``__init__``; ``generate_caption`` can then be called
    repeatedly.
    """

    def __init__(self):
        """Load backbones, image processor, tokenizer and trained weights."""
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        vit_model = ViTModel.from_pretrained(
            "google/vit-base-patch16-224", token=HF_TOKEN).to(self.device)
        bart_model = BartForConditionalGeneration.from_pretrained(
            "facebook/bart-base").to(self.device)
        self.processor = ViTImageProcessor.from_pretrained(
            "google/vit-base-patch16-224")
        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
        self.model = ImageCaptionGenerationWithAttention(
            vit_model, bart_model, self.tokenizer)
        # Only the two backbones were moved above. Move the wrapper as a whole
        # so any layers it defines itself end up on the same device.
        # (Presumably the wrapper is a torch.nn.Module, since it exposes
        # load_state_dict/eval - verify against model_architecture.)
        self.model.to(self.device)
        # NOTE(review): torch.load unpickles the checkpoint; only load files
        # from a trusted source (consider weights_only=True where the
        # installed torch version supports it).
        self.model.load_state_dict(torch.load(
            'image_captioning_model_state_dict.pt', map_location=self.device))
        # Inference only: switch the module to evaluation mode.
        self.model.eval()

    def generate_caption(self, frame, max_length=50, num_beams=5):
        """Return a caption for ``frame``.

        Args:
            frame: Image to caption, in any form the ViT image processor
                accepts (the Gradio interface in this file passes a PIL
                image). ``None`` means no image was supplied.
            max_length: Forwarded to the model's ``generate`` as its second
                positional argument.
            num_beams: Forwarded to the model's ``generate`` as its third
                positional argument.

        Returns:
            The decoded caption with special tokens removed, or ``None`` when
            no image was supplied or captioning failed (the failure is
            printed together with its traceback).
        """
        # Nothing to caption; skip the model instead of failing inside it.
        if frame is None:
            return None
        try:
            # The processor returns CPU tensors; the model lives on
            # self.device, so move the input there before the forward pass.
            image_pixel_values = self.processor(
                frame, return_tensors="pt").pixel_values.to(self.device)
            # No gradients are needed for inference.
            with torch.no_grad():
                generated_caption_ids = self.model.generate(
                    image_pixel_values, max_length, num_beams)
            return self.tokenizer.decode(
                generated_caption_ids[0], skip_special_tokens=True)
        except Exception as e:
            # Best-effort boundary for the UI: report the failure and yield
            # an empty result rather than propagating the exception.
            print(e)
            print(traceback.format_exc())
            return None


# Build the captioner once at start-up; the same instance serves every request.
gc = GenerateCaptions()

# One image in, one text caption out.
demo = gr.Interface(
    # Presentation
    title="Image Caption Generation",
    submit_btn='Generate Caption',
    flagging_mode='never',
    examples=['Image.jpg', 'Image 2.jpg'],
    # Wiring: the uploaded image reaches the callback as a PIL image.
    inputs=gr.Image(type='pil'),
    outputs="text",
    fn=gc.generate_caption,
)


demo.launch()