"""Gradio demo that captions an image fetched from a URL with Salesforce BLIP.

The BLIP repository (https://github.com/salesforce/BLIP) must be importable
so that ``models.blip`` resolves, e.g. by running this file from inside a
clone of that repository.
"""
import sys
from functools import lru_cache
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from models.blip import blip_decoder

# Colab setup, kept for reference only (notebook magics are not valid in a
# plain Python script):
# if 'google.colab' in sys.modules:
#     print('Running in Colab.')
#     !pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
#     !git clone https://github.com/salesforce/BLIP
#     %cd BLIP

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

IMAGE_SIZE = 384
BLIP_CHECKPOINT_URL = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'
IMAGENET_LABELS_URL = "https://git.io/JJkYN"
# Without a timeout a stalled server would block a request handler forever.
REQUEST_TIMEOUT_S = 30


@lru_cache(maxsize=1)
def _load_classifier():
    """Return the pretrained torch-hub ResNet-18 in eval mode, loading it once."""
    return torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True).eval()


@lru_cache(maxsize=1)
def _load_imagenet_labels():
    """Download the newline-separated class-label list once and return it as a tuple."""
    response = requests.get(IMAGENET_LABELS_URL, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return tuple(response.text.split("\n"))


def classify_image(inp):
    """Classify an image with ResNet-18 and return ``{label: confidence}``.

    This is the classifier that the original script defined as a first
    ``predict`` and then immediately shadowed; it is not wired to the UI
    below. It previously loaded its model and labels at import time even
    though it was never called, so both are now loaded lazily on first use.

    Args:
        inp: An image accepted by ``torchvision.transforms.ToTensor``
            (the original interface supplied a PIL image).

    Returns:
        A dict mapping each class label to its softmax probability.
    """
    batch = transforms.ToTensor()(inp).unsqueeze(0)
    with torch.no_grad():
        prediction = torch.nn.functional.softmax(_load_classifier()(batch)[0], dim=0)
    # zip stops at the shorter sequence, so a trailing empty label produced by
    # a final newline in the label file is ignored.
    return dict(zip(_load_imagenet_labels(), prediction.tolist()))


def _preview_in_notebook(image):
    """Show a 1/5-scale thumbnail when running under IPython; otherwise do nothing.

    The original called the notebook-only global ``display`` unconditionally,
    which raises ``NameError`` when the file is run as a plain script.
    """
    try:
        from IPython import get_ipython
        from IPython.display import display
    except ImportError:
        return
    if get_ipython() is None:
        return
    width, height = image.size
    # Clamp to 1 px so images smaller than 5 px do not request a zero-sized resize.
    display(image.resize((max(1, width // 5), max(1, height // 5))))


def load_demo_image(image_size, device, imageurl):
    """Download an image and turn it into a normalized batch-of-one tensor.

    Args:
        image_size: Side length in pixels; the image is resized to a square
            of ``image_size`` x ``image_size``.
        device: Torch device the returned tensor is moved to.
        imageurl: URL of the image to download.

    Returns:
        The transformed image with a leading batch dimension, on ``device``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # NOTE(review): the URL comes straight from the UI and is fetched
    # server-side; restrict allowed hosts if this is deployed publicly.
    response = requests.get(imageurl.strip(), timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    # Decode from the fully downloaded body rather than the raw socket stream,
    # so any content encoding applied by the server is already undone.
    raw_image = Image.open(BytesIO(response.content)).convert('RGB')

    _preview_in_notebook(raw_image)

    # Normalization constants are copied unchanged from the original script
    # (presumably the statistics the BLIP checkpoint expects; confirm upstream).
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return transform(raw_image).unsqueeze(0).to(device)


@lru_cache(maxsize=1)
def _load_caption_model():
    """Build the BLIP caption decoder once, in eval mode, on ``device``.

    The original rebuilt the model from the checkpoint URL inside every
    ``predict`` call; caching it keeps that cost to the first request.
    """
    model = blip_decoder(pretrained=BLIP_CHECKPOINT_URL, image_size=IMAGE_SIZE, vit='base')
    model.eval()
    return model.to(device)


def predict(imageurl):
    """Generate a caption for the image found at ``imageurl``.

    Args:
        imageurl: URL of the image to caption.

    Returns:
        The string ``'caption: '`` followed by the first generated caption.

    Raises:
        requests.RequestException: If the image cannot be downloaded.
    """
    image = load_demo_image(image_size=IMAGE_SIZE, device=device, imageurl=imageurl)
    model = _load_caption_model()

    with torch.no_grad():
        # beam search
        caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)
        # nucleus sampling alternative:
        # caption = model.generate(image, sample=True, top_p=0.9, max_length=20, min_length=5)
    return 'caption: ' + caption[0]


# The output component is kept exactly as in the original; predict returns a
# plain string, which the Label component is expected to render as a single
# label (confirm against the installed Gradio version).
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs=gr.outputs.Label(num_top_classes=3),
)

if __name__ == "__main__":
    demo.launch()