import torch
import torch.nn as nn
import cv2
import gradio as gr
import numpy as np
from PIL import Image
import transformers
from transformers import RobertaModel, RobertaTokenizer
import timm
import pandas as pd
import matplotlib.pyplot as plt
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from model import Model
from output import visualize_output

# Use GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Initialize the pretrained backbones: ViT image encoder and RoBERTa text encoder
vit = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=0, global_pool='').to(device)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
roberta = RobertaModel.from_pretrained("roberta-base")
model = Model(vit, roberta, tokenizer, device).to(device)
model.eval()

# Load the trained model weights
state = torch.load('saved_model', map_location=torch.device('cpu'))
model.load_state_dict(state['val_model_dict'])

# Create the transform for the input image from the ViT's data config
config = resolve_data_config({}, model=vit)
config['no_aug'] = True
config['interpolation'] = 'bilinear'


# Inference function
def query_image(input_img, query, binarize, eval_threshold, crop_mode, crop_pct):
    # Map the 'center' option to None so timm falls back to its default center crop
    if crop_mode == 'center':
        crop_mode = None
    config['crop_pct'] = crop_pct
    config['crop_mode'] = crop_mode
    transform = create_transform(**config)

    # Preprocess the input image and add a batch dimension
    PIL_image = Image.fromarray(input_img, "RGB")
    img = transform(PIL_image)
    img = torch.unsqueeze(img, 0).to(device)

    # Run the model on the image/query pair and draw the predictions
    with torch.no_grad():
        output = model(img, query)
    img = visualize_output(img, output, binarize, eval_threshold)

    return img


# Gradio interface
description = """
Gradio demo for an object detection architecture, introduced in my bachelor thesis (link will be added).
\n\n
You can use this architecture to detect objects using textual queries. To use it, simply upload an image and enter any query you want. It can be a single word or a sentence. The model is trained to recognize only 80 categories (classes) from the COCO Detection 2017 dataset. Refer to this website or the original COCO paper to see the full list of categories.
\n\n
Best results are obtained using one of these sentences, which were used during training: