Spaces:

ikraamkb
/

Summarization

Running

App Files Files Community

ikraamkb committed 15 days ago

Commit

f94fa3b

verified ·

1 Parent(s): 607327a

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -36

app.py CHANGED Viewed

@@ -1,52 +1,48 @@
-from fastapi import FastAPI
-from fastapi.responses import RedirectResponse
-import gradio as gr
 from PIL import Image
-import numpy as np
-from transformers import pipeline
 from gtts import gTTS
 import tempfile
-import os
-import pytesseract  # ✅ Replacing easyocr
 app = FastAPI()
-# Models
-caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
-vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
-def process_image_question(image: Image.Image, question: str):
-    if image is None:
-        return "No image uploaded.", None
-    try:
-        # Convert image to numpy
-        np_image = np.array(image)
-        # OCR text using pytesseract
-        extracted_text = pytesseract.image_to_string(image)
-        # Caption
-        caption = caption_model(image)[0]['generated_text']
-        # Visual QA
-        vqa_result = vqa_model(image=image, question=question)
-        answer = vqa_result[0]['answer']
-        # Answer as speech
         tts = gTTS(text=answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
             audio_path = tmp.name
-        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
-        return final_output, audio_path
-    except Exception as e:
-        return f"❌ Error: {e}", None
 gui = gr.Interface(
     fn=process_image_question,
@@ -55,15 +51,15 @@ gui = gr.Interface(
         gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
     ],
     outputs=[
-        gr.Textbox(label="Result", lines=10),
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
-    description="Upload an image and ask any question — even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud."
 )
 app = gr.mount_gradio_app(app, gui, path="/")
 @app.get("/")
 def home():
-    return RedirectResponse(url="/")

+from fastapi import FastAPI, UploadFile, Form
+from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
+import os
+import shutil
 from PIL import Image
+from transformers import ViltProcessor, ViltForQuestionAnswering
 from gtts import gTTS
+import torch
 import tempfile
+import gradio as gr
 app = FastAPI()
+# Load the ViLT VQA processor and model once, at module level, from the same checkpoint
+vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+def answer_question_from_image(image, question):  # returns (answer text, audio file path or None)
+    if image is None or not question.strip():  # NOTE(review): assumes question is a str; None would raise on .strip()
+        return "Please upload an image and ask a question.", None
+    # Encode the image/question pair as PyTorch tensors and run the model without tracking gradients
+    inputs = vqa_processor(image, question, return_tensors="pt")
+    with torch.no_grad():
+        outputs = vqa_model(**inputs)
+    predicted_id = outputs.logits.argmax(-1).item()  # index of the highest-scoring logit; .item() presumes a single example
+    answer = vqa_model.config.id2label[predicted_id]  # look up the label for that index in the model config
+    # Synthesize the answer as speech and save it to a temporary .mp3 file
+    try:
         tts = gTTS(text=answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:  # delete=False keeps the file so its path can be returned; nothing in this function removes it
             tts.save(tmp.name)
             audio_path = tmp.name
+    except Exception as e:  # broad catch: any failure in the TTS/save step falls back to a text-only result
+        return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None
+    return answer, audio_path
+def process_image_question(image: Image.Image, question: str):  # callback passed as fn= to gr.Interface
+    answer, audio_path = answer_question_from_image(image, question)  # delegates with the arguments unchanged
+    return answer, audio_path  # (answer text, audio file path or None)
 gui = gr.Interface(
     fn=process_image_question,
         gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
     ],
     outputs=[
+        gr.Textbox(label="Answer", lines=5),
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
+    description="Upload an image and ask a question. You'll get a text + spoken answer."
 )
 app = gr.mount_gradio_app(app, gui, path="/")
 @app.get("/")
 def home():  # NOTE(review): the Gradio app is already mounted at "/" — confirm this route is reachable and still needed
+    return RedirectResponse(url="/")  # redirects to the same path this route is registered on