|
import gradio as gr |
|
from model_architecture import ImageCaptionGenerationWithAttention |
|
from transformers import BartForConditionalGeneration, BartTokenizer, ViTModel, ViTImageProcessor |
|
import torch |
|
from PIL import Image |
|
from dotenv import load_dotenv |
|
import os |
|
import traceback |
|
|
|
# Load variables from a local .env file into the process environment.
load_dotenv()

# Hugging Face access token used to download gated/rate-limited models.
# NOTE(review): env var name is lowercase 'hf_token' — confirm it matches the .env file.
HF_TOKEN = os.getenv('hf_token')
|
|
|
|
|
class GenerateCaptions:
    """Loads a ViT encoder + BART decoder captioning model and generates
    captions for PIL images.

    Heavy resources (pretrained weights, tokenizer, processor) are loaded
    once in ``__init__``; ``generate_caption`` is then cheap to call per image.
    """

    def __init__(self):
        # Prefer GPU when available; the same device is used for the model
        # and (fixed here) for the input pixel values.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # Pretrained vision encoder; HF_TOKEN authenticates the download.
        vit_model = ViTModel.from_pretrained(
            "google/vit-base-patch16-224", token=HF_TOKEN).to(self.device)
        # Pretrained text decoder.
        bart_model = BartForConditionalGeneration.from_pretrained(
            "facebook/bart-base").to(self.device)
        # Image preprocessing (resize/normalize to ViT's expected input).
        self.processor = ViTImageProcessor.from_pretrained(
            "google/vit-base-patch16-224")
        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
        # Project-local wrapper combining the two models with attention.
        self.model = ImageCaptionGenerationWithAttention(
            vit_model, bart_model, self.tokenizer)
        # Restore fine-tuned weights; map_location keeps CPU-only hosts working.
        self.model.load_state_dict(torch.load(
            'image_captioning_model_state_dict.pt', map_location=self.device))
        self.model.eval()

    def generate_caption(self, frame, max_length=50, num_beams=5):
        """Generate a caption for a single image.

        Args:
            frame: PIL image (supplied by the Gradio ``Image`` input).
            max_length: maximum caption length in tokens.
            num_beams: beam-search width.

        Returns:
            The decoded caption string, or a short error message if
            generation fails (so the Gradio UI shows something useful
            instead of an empty output).
        """
        try:
            # BUG FIX: move pixel values to the model's device; previously the
            # input stayed on CPU and crashed when the model was on CUDA.
            image_pixel_values = self.processor(
                frame, return_tensors="pt").pixel_values.to(self.device)
            # Inference only — skip autograd bookkeeping.
            with torch.no_grad():
                generated_caption_ids = self.model.generate(
                    image_pixel_values, max_length, num_beams)
            return self.tokenizer.decode(
                generated_caption_ids[0], skip_special_tokens=True)
        except Exception as e:
            # Log full traceback for debugging, but surface a readable
            # message to the UI instead of silently returning None.
            print(e)
            print(traceback.format_exc())
            return f"Caption generation failed: {e}"
|
|
|
|
|
# Build the captioning service once at import time (model loading is slow).
gc = GenerateCaptions()

# Gradio UI: single image in, caption text out, with two bundled examples.
demo = gr.Interface(
    fn=gc.generate_caption,
    inputs=gr.Image(type='pil'),
    outputs="text",
    title="Image Caption Generation",
    examples=['Image.jpg', 'Image 2.jpg'],
    submit_btn='Generate Caption',
    flagging_mode='never'
)

# Guard the server launch so importing this module (e.g. for tests or
# `gradio deploy`, which expects a module-level `demo`) does not start it.
if __name__ == "__main__":
    demo.launch()
|
|