Update app.py
app.py
CHANGED
@@ -5,23 +5,23 @@ import torch
 from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
 from PIL import Image
 
-model_name="hfl/vle-base-for-vqa"
-model = VLEForVQA.from_pretrained(model_name)
-vle_processor = VLEProcessor.from_pretrained(model_name)
-vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
+# model_name="hfl/vle-base-for-vqa"
+# model = VLEForVQA.from_pretrained(model_name)
+# vle_processor = VLEProcessor.from_pretrained(model_name)
+# vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
 
 
 from transformers import BlipForQuestionAnswering, BlipProcessor
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
-model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
+# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+# model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
 
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+# cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 
 
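This hunk comments out every model load that ran at import time: the VLE VQA pipeline plus the BLIP VQA and captioning checkpoints. If the goal is a Space that starts quickly while keeping the models available, lazy loading is one common alternative. The sketch below is only an illustration of that pattern, not the author's code; the `get_vqa` and `get_captioner` helpers are hypothetical, and only the checkpoint names are taken from the diff.

```python
# Sketch: load each BLIP checkpoint on first use instead of at import time.
# get_vqa/get_captioner are hypothetical helpers, not functions in app.py.
from functools import lru_cache

import torch
from transformers import (
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    BlipProcessor,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

@lru_cache(maxsize=None)
def get_vqa():
    # Same VQA checkpoint the pre-change code loaded eagerly.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
    model = BlipForQuestionAnswering.from_pretrained(
        "Salesforce/blip-vqa-capfilt-large"
    ).to(device)
    return processor, model

@lru_cache(maxsize=None)
def get_captioner():
    # Same captioning checkpoint the pre-change code loaded eagerly.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-large"
    ).to(device)
    return processor, model
```

With this pattern, only the first call to either helper pays the download and initialisation cost; later calls return the cached processor/model pair.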
@@ -128,7 +128,7 @@ def inference_chat(input_image,input_text):
 title = """# VQA with VLE and LLM"""
 description = """We demonstrate three visual question answering systems built with VLE and LLM:
 
-
+
 
 * VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.
 
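The description edited here spells out the VQA + LLM (short answer) pipeline: caption the image, then hand the caption, the question, and the VQA model's candidate answers to the LLM and ask it to pick one. Below is a minimal sketch of how such a prompt could be assembled; `build_short_answer_prompt` and the example values are hypothetical, not part of app.py.

```python
# Sketch of the "VQA + LLM (short answer)" prompt described above:
# caption + question + VQA candidates, plus an instruction to choose
# one candidate. Helper and values are illustrative only.
def build_short_answer_prompt(caption: str, question: str, candidates: list[str]) -> str:
    options = ", ".join(candidates)
    return (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Candidate answers from the VQA model: {options}\n"
        "Select the most reasonable answer from the candidates and reply "
        "with that answer only."
    )

print(build_short_answer_prompt(
    caption="a dog lying on a couch",
    question="What animal is on the couch?",
    candidates=["dog", "cat", "pillow"],
))
```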
@@ -168,8 +168,8 @@ with gr.Blocks(
     )
     '''
     with gr.Column():
-        caption_output = gr.Textbox(lines=0, label="VQA ")
-        caption_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (short answer)")
+        caption_output = gr.Textbox(lines=0, label="* VQA: The image and the question are fed into a VQA model (VLEForVQA) and the model predicts the answer.")
+        caption_output_v1 = gr.Textbox(lines=0, label="* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to VQA candidates.")
 
         gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
 
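For orientation, here is a minimal, self-contained sketch of the output column this hunk edits: three textboxes inside a `gr.Column()` under `gr.Blocks()`. The labels are shortened here, and `demo.launch()` is added only so the sketch runs on its own; it is an illustration of the layout, not the Space's actual UI code.

```python
# Minimal sketch of the output column edited in the hunk above.
import gradio as gr

with gr.Blocks() as demo:
    with gr.Column():
        caption_output = gr.Textbox(label="VQA")
        caption_output_v1 = gr.Textbox(label="VQA + LLM (short answer)")
        gpt3_output_v1 = gr.Textbox(label="VQA + LLM (long answer)")

demo.launch()
```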