xxx1 committed
Commit 764d4c7 · 1 Parent(s): 25c7e03

Update app.py

Files changed (1)
  1. app.py +12 -12
app.py CHANGED
@@ -5,23 +5,23 @@ import torch
 from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
 from PIL import Image
 
-model_name="hfl/vle-base-for-vqa"
-model = VLEForVQA.from_pretrained(model_name)
-vle_processor = VLEProcessor.from_pretrained(model_name)
-vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
+# model_name="hfl/vle-base-for-vqa"
+# model = VLEForVQA.from_pretrained(model_name)
+# vle_processor = VLEProcessor.from_pretrained(model_name)
+# vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
 
 
 from transformers import BlipForQuestionAnswering, BlipProcessor
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
-model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
+# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+# model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)
 
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+# cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 
 
@@ -128,7 +128,7 @@ def inference_chat(input_image,input_text):
 title = """# VQA with VLE and LLM"""
 description = """We demonstrate three visual question answering systems built with VLE and LLM:
 
-* VQA: The image and the question are fed into a VQA model (VLEForVQA) and the model predicts the answer.
+
 
 * VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.
 
@@ -168,8 +168,8 @@ with gr.Blocks(
 )
 '''
 with gr.Column():
-caption_output = gr.Textbox(lines=0, label="VQA ")
-caption_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (short answer)")
+caption_output = gr.Textbox(lines=0, label="* VQA: The image and the question are fed into a VQA model (VLEForVQA) and the model predicts the answer.")
+caption_output_v1 = gr.Textbox(lines=0, label="* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to VQA candidates.")
 
 gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
 
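
For context, the description string in the second hunk spells out how the "VQA + LLM (short answer)" mode works: caption the image, collect answer candidates from the VQA model, and ask the LLM to pick the most reasonable one. Below is a minimal sketch of that flow, not taken from this commit: the model loading mirrors the lines the commit comments out, but the VLEForVQAPipeline call signature, the top_k value, the prompt wording, and the ask_llm callable are all assumptions for illustration.

# Sketch only: the VLEForVQAPipeline call signature, top_k, and prompt wording
# are assumptions; ask_llm stands in for whatever LLM call app.py actually makes.
from typing import Callable
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline

model_name = "hfl/vle-base-for-vqa"
vqa_pipeline = VLEForVQAPipeline(
    model=VLEForVQA.from_pretrained(model_name),
    device="cpu",
    vle_processor=VLEProcessor.from_pretrained(model_name),
)
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

def vqa_with_llm_short_answer(image: Image.Image, question: str,
                              ask_llm: Callable[[str], str]) -> str:
    # 1) Caption the image with the BLIP captioning model.
    cap_inputs = cap_processor(image, return_tensors="pt")
    caption = cap_processor.decode(cap_model.generate(**cap_inputs)[0],
                                   skip_special_tokens=True)

    # 2) Collect answer candidates from the VLE VQA pipeline (assumed to return
    #    a list of {"answer": ..., "score": ...} dicts, like transformers' VQA pipelines).
    candidates = [c["answer"] for c in vqa_pipeline(image=image, question=question, top_k=5)]

    # 3) Ask the LLM to choose the most reasonable candidate.
    prompt = (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Answer candidates: {', '.join(candidates)}\n"
        "Select the most reasonable answer from the candidates and reply with it only."
    )
    return ask_llm(prompt)

The "VQA + LLM (long answer)" mode described in the new Textbox label follows the same steps but drops the candidate constraint, letting the LLM answer freely from the caption and the question.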