nielsr (HF staff) committed
Commit: 340acb8
Parent: b713d8f

Update app.py

Files changed (1)
  1. app.py +8 -11
app.py CHANGED
@@ -25,13 +25,10 @@ vitgpt_processor = AutoImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image
 vitgpt_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 vitgpt_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 
-# not using CoCa for the moment as it requires too much RAM
-# filepath = hf_hub_download(repo_id="gpucce/CoCa", filename="laion2B-s13B-b90k-mscoco-2014.pt", repo_type="space")
-
-# coca_model, _, coca_transform = open_clip.create_model_and_transforms(
-#     "coca_ViT-L-14",
-#     pretrained=filepath,
-# )
+coca_model, _, coca_transform = open_clip.create_model_and_transforms(
+    model_name="coca_ViT-L-14",
+    pretrained="mscoco_finetuned_laion2B-s13B-b90k"
+)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -40,7 +37,7 @@ blip_model_base.to(device)
 git_model_large.to(device)
 blip_model_large.to(device)
 vitgpt_model.to(device)
-# coca_model.to(device)
+coca_model.to(device)
 
 def generate_caption(processor, model, image, tokenizer=None):
     inputs = processor(images=image, return_tensors="pt").to(device)
@@ -72,13 +69,13 @@ def generate_captions(image):
 
     caption_vitgpt = generate_caption(vitgpt_processor, vitgpt_model, image, vitgpt_tokenizer)
 
-    # caption_coca = generate_caption_coca(coca_model, coca_transform, image)
+    caption_coca = generate_caption_coca(coca_model, coca_transform, image)
 
-    return caption_git_base, caption_git_large, caption_blip_base, caption_blip_large, caption_vitgpt
+    return caption_git_base, caption_git_large, caption_blip_base, caption_blip_large, caption_vitgpt, caption_coca
 
 
 examples = [["cats.jpg"], ["stop_sign.png"], ["astronaut.jpg"]]
-outputs = [gr.outputs.Textbox(label="Caption generated by GIT-base"), gr.outputs.Textbox(label="Caption generated by GIT-large"), gr.outputs.Textbox(label="Caption generated by BLIP-base"), gr.outputs.Textbox(label="Caption generated by BLIP-large"), gr.outputs.Textbox(label="Caption generated by ViT+GPT-2")]
+outputs = [gr.outputs.Textbox(label="Caption generated by GIT-base"), gr.outputs.Textbox(label="Caption generated by GIT-large"), gr.outputs.Textbox(label="Caption generated by BLIP-base"), gr.outputs.Textbox(label="Caption generated by BLIP-large"), gr.outputs.Textbox(label="Caption generated by ViT+GPT-2"), gr.outputs.Textbox(label="Caption generated by CoCa")]
 
 title = "Interactive demo: comparing image captioning models"
 description = "Gradio Demo to compare GIT, BLIP and ViT+GPT2, 3 state-of-the-art vision+language models. To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."