Update app.py
app.py CHANGED
@@ -68,7 +68,7 @@ from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
 import os
 import shutil
 from PIL import Image
-from transformers import ViltProcessor, ViltForQuestionAnswering,
+from transformers import ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
 from gtts import gTTS
 import torch
 import tempfile
@@ -80,33 +80,37 @@ app = FastAPI()
 vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 
-# Load GPT model
-
+# Load GPT model to rewrite answers
+gpt_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
+gpt_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
 
-def rewrite_answer(question
-    prompt = f"
-
-
-
-
-
-
+def rewrite_answer(question):
+    prompt = f"{question}\nAnswer with a full sentence:"
+    inputs = gpt_tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        outputs = gpt_model.generate(
+            **inputs,
+            max_new_tokens=40,
+            do_sample=False,
+            pad_token_id=gpt_tokenizer.eos_token_id
+        )
+    generated = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    rewritten = generated.split(":")[-1].strip()
+    return rewritten
 
 def answer_question_from_image(image, question):
     if image is None or not question.strip():
         return "Please upload an image and ask a question.", None
 
-    # Process with model
     inputs = vqa_processor(image, question, return_tensors="pt")
     with torch.no_grad():
         outputs = vqa_model(**inputs)
     predicted_id = outputs.logits.argmax(-1).item()
     short_answer = vqa_model.config.id2label[predicted_id]
 
-    # Rewrite short answer
-    full_answer = rewrite_answer(question
+    # Rewrite short answer to full sentence with GPT-Neo
+    full_answer = rewrite_answer(f"Question: {question}\nAnswer: {short_answer}")
 
-    # Generate TTS audio
     try:
         tts = gTTS(text=full_answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
@@ -132,7 +136,7 @@ gui = gr.Interface(
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
-    description="Upload an image and ask a question. You'll get a
+    description="Upload an image and ask a question. You'll get a full-sentence spoken answer."
 )
 
 app = gr.mount_gradio_app(app, gui, path="/")
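For a quick sanity check of the new rewrite path outside the Space, here is a minimal standalone sketch, assuming the same checkpoint and decoding settings the diff introduces (the question/answer pair at the bottom is a hypothetical example, not part of app.py):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Same checkpoint the Space now loads at startup (several GB on first download).
gpt_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
gpt_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

def rewrite_answer(question):
    # Prompt format from the new app.py: the caller passes a
    # "Question: ...\nAnswer: ..." block, and the cue asks GPT-Neo
    # to restate the short VQA answer as a full sentence.
    prompt = f"{question}\nAnswer with a full sentence:"
    inputs = gpt_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt_model.generate(
            **inputs,
            max_new_tokens=40,
            do_sample=False,  # greedy decoding, so output is deterministic
            pad_token_id=gpt_tokenizer.eos_token_id,
        )
    generated = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text includes the prompt itself; keep only what
    # follows the last colon, i.e. the generated sentence.
    return generated.split(":")[-1].strip()

# Hypothetical inputs mirroring the call site in answer_question_from_image().
print(rewrite_answer("Question: what color is the cat?\nAnswer: black"))

One caveat on the post-processing: split(":")[-1] keeps only the text after the last colon anywhere in the output, so a colon inside GPT-Neo's continuation would clip the sentence; splitting once on the literal cue "Answer with a full sentence:" would be a more robust variant.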