ikraamkb committed
Commit c55ca48 · verified · 1 Parent(s): b7dc4fe

Update app.py

Files changed (1)
  1. app.py +14 -17
app.py CHANGED
@@ -65,26 +65,26 @@ app = gr.mount_gradio_app(app, gui, path="/")
 @app.get("/")
 def home():
     return RedirectResponse(url="/") """
-from fastapi import FastAPI
-from fastapi.responses import RedirectResponse
-import tempfile
+from fastapi import FastAPI, UploadFile, Form
+from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
+import os
+import shutil
 from PIL import Image
-import torch
 from transformers import ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
 from gtts import gTTS
+import torch
+import tempfile
 import gradio as gr
-from transformers import AutoModelForSeq2SeqLM
+
 app = FastAPI()
 
 # Load VQA Model
 vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 
-# Load GPT model to rewrite answers
-
-gpt_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
-gpt_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
-
+# Load GPT model to rewrite answers (Phi-1.5)
+gpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
+gpt_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5")
 
 def rewrite_answer(question, short_answer):
     prompt = (
@@ -98,7 +98,7 @@ def rewrite_answer(question, short_answer):
         **inputs,
         max_new_tokens=50,
         do_sample=True,
-        top_p=0.9,
+        top_p=0.95,
         temperature=0.7,
         pad_token_id=gpt_tokenizer.eos_token_id
     )
@@ -122,17 +122,16 @@ def answer_question_from_image(image, question):
     predicted_id = outputs.logits.argmax(-1).item()
     short_answer = vqa_model.config.id2label[predicted_id]
 
-    # Rewrite to human-like sentence
+    # Rewrite short answer to full sentence with Phi-1.5
     full_answer = rewrite_answer(question, short_answer)
 
-    # Convert to speech
     try:
         tts = gTTS(text=full_answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
            audio_path = tmp.name
     except Exception as e:
-        return f"{full_answer}\n\n⚠️ Audio generation error: {e}", None
+        return f"Answer: {full_answer}\n\n⚠️ Audio generation error: {e}", None
 
     return full_answer, audio_path
 
@@ -140,7 +139,6 @@ def process_image_question(image: Image.Image, question: str):
     answer, audio_path = answer_question_from_image(image, question)
     return answer, audio_path
 
-# Gradio UI
 gui = gr.Interface(
     fn=process_image_question,
     inputs=[
@@ -152,10 +150,9 @@ gui = gr.Interface(
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
-    description="Upload an image and ask a question. You'll get a human-like spoken answer."
+    description="Upload an image and ask a question. You'll get a full-sentence spoken answer."
 )
 
-# Mount on FastAPI
 app = gr.mount_gradio_app(app, gui, path="/")
 
 @app.get("/")
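
Note: the rewrite_answer hunks show only the two model-loading lines and the argument list passed to gpt_model.generate(). Below is a minimal sketch of how the changed pieces might fit together after the switch to microsoft/phi-1_5; the prompt wording and the decoding step are assumptions, since they fall outside the diff context.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load GPT model to rewrite answers (Phi-1.5), as in this commit.
gpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
gpt_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5")

def rewrite_answer(question, short_answer):
    # Hypothetical prompt: the real wording is not part of the diff.
    prompt = (
        f"Question: {question}\n"
        f"Short answer: {short_answer}\n"
        "Rewrite the short answer as one full, natural sentence:"
    )
    inputs = gpt_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = gpt_model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            top_p=0.95,            # raised from 0.9 in this commit
            temperature=0.7,
            pad_token_id=gpt_tokenizer.eos_token_id,
        )
    # Causal LMs return the prompt plus the completion; keep only the new tokens.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return gpt_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()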
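
The answer_question_from_image hunk changes only two comments and the error-return string, but its context lines outline the whole pipeline: ViLT picks a short label, the label is rewritten into a sentence, and gTTS turns the sentence into an MP3. A sketch of that surrounding flow, reconstructed from the context lines, follows; the processor call and the placeholder rewrite step are assumptions.

import tempfile

import torch
from PIL import Image
from gtts import gTTS
from transformers import ViltProcessor, ViltForQuestionAnswering

vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

def rewrite_answer(question, short_answer):
    # Placeholder for the Phi-1.5 rewriting step sketched above.
    return f"The answer to '{question}' is {short_answer}."

def answer_question_from_image(image: Image.Image, question: str):
    # Encode the image/question pair and take the highest-scoring VQA label
    # (this preprocessing line is assumed; the hunk starts at the argmax).
    encoding = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = vqa_model(**encoding)
    predicted_id = outputs.logits.argmax(-1).item()
    short_answer = vqa_model.config.id2label[predicted_id]

    # Rewrite short answer to full sentence with Phi-1.5
    full_answer = rewrite_answer(question, short_answer)

    # Convert the sentence to speech; on failure, return text only.
    try:
        tts = gTTS(text=full_answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name
    except Exception as e:
        return f"Answer: {full_answer}\n\n⚠️ Audio generation error: {e}", None

    return full_answer, audio_path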
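
The last two hunks only drop comments and reword the Gradio description, but they show how the UI is mounted on FastAPI. A rough sketch of that wiring is below; the input list and the first output component are assumptions because the hunk is cut off at inputs=[, and process_image_question() is stubbed out.

import gradio as gr
from fastapi import FastAPI

app = FastAPI()

def process_image_question(image, question):
    # Stand-in for the real pipeline; see answer_question_from_image() above.
    return f"You asked: {question}", None

gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil"),                      # assumed input components
        gr.Textbox(label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Answer"),                # assumed first output
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask a question. You'll get a full-sentence spoken answer.",
)

# Serve the Gradio UI at the root path of the FastAPI app.
app = gr.mount_gradio_app(app, gui, path="/")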