xxx1 committed
Commit 764d4c7 · 1 Parent(s): 25c7e03

Update app.py

Files changed (1)
  1. app.py +12 -12
app.py CHANGED
@@ -5,23 +5,23 @@ import torch
 from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
 from PIL import Image
 
-model_name="hfl/vle-base-for-vqa"
-model = VLEForVQA.from_pretrained(model_name)
-vle_processor = VLEProcessor.from_pretrained(model_name)
-vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
+# model_name="hfl/vle-base-for-vqa"
+# model = VLEForVQA.from_pretrained(model_name)
+# vle_processor = VLEProcessor.from_pretrained(model_name)
+# vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
 
 
 from transformers import BlipForQuestionAnswering, BlipProcessor
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
-model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
+# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+# model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
 
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+# cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 
 
@@ -128,7 +128,7 @@ def inference_chat(input_image,input_text):
 title = """# VQA with VLE and LLM"""
 description = """We demonstrate three visual question answering systems built with VLE and LLM:
 
-* VQA: The image and the question are fed into a VQA model (VLEForVQA) and the model predicts the answer.
+
 
 * VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.
 
@@ -168,8 +168,8 @@ with gr.Blocks(
 )
 '''
 with gr.Column():
-caption_output = gr.Textbox(lines=0, label="VQA ")
-caption_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (short answer)")
+caption_output = gr.Textbox(lines=0, label="* VQA: The image and the question are fed into a VQA model (VLEForVQA) and the model predicts the answer.")
+caption_output_v1 = gr.Textbox(lines=0, label="* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to VQA candidates.")
 
 gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
 
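
For context, the description string in the second hunk spells out how the "VQA + LLM (short answer)" mode works: caption the image, collect answer candidates from the VQA model, and ask the LLM to pick the most reasonable one. Below is a minimal sketch of that flow, not taken from this commit: the model loading mirrors the lines the commit comments out, but the VLEForVQAPipeline call signature, the top_k value, the prompt wording, and the ask_llm callable are all assumptions for illustration.

# Sketch only: the VLEForVQAPipeline call signature, top_k, and prompt wording
# are assumptions; ask_llm stands in for whatever LLM call app.py actually makes.
from typing import Callable
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline

model_name = "hfl/vle-base-for-vqa"
vqa_pipeline = VLEForVQAPipeline(
    model=VLEForVQA.from_pretrained(model_name),
    device="cpu",
    vle_processor=VLEProcessor.from_pretrained(model_name),
)
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

def vqa_with_llm_short_answer(image: Image.Image, question: str,
                              ask_llm: Callable[[str], str]) -> str:
    # 1) Caption the image with the BLIP captioning model.
    cap_inputs = cap_processor(image, return_tensors="pt")
    caption = cap_processor.decode(cap_model.generate(**cap_inputs)[0],
                                   skip_special_tokens=True)

    # 2) Collect answer candidates from the VLE VQA pipeline (assumed to return
    #    a list of {"answer": ..., "score": ...} dicts, like transformers' VQA pipelines).
    candidates = [c["answer"] for c in vqa_pipeline(image=image, question=question, top_k=5)]

    # 3) Ask the LLM to choose the most reasonable candidate.
    prompt = (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Answer candidates: {', '.join(candidates)}\n"
        "Select the most reasonable answer from the candidates and reply with it only."
    )
    return ask_llm(prompt)

The "VQA + LLM (long answer)" mode described in the new Textbox label follows the same steps but drops the candidate constraint, letting the LLM answer freely from the caption and the question.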