# Summarization / app.py
# Hugging Face Space file-viewer header (converted to comments so the file parses):
# uploaded by ikraamkb — "Update app.py" — commit 05e0b44 (verified) — 2.06 kB
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
import pytesseract # βœ… Replacing easyocr
app = FastAPI()  # ASGI app; the Gradio UI is mounted onto it further below

# Models — created once at import time; the pipelines download weights on first run.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")  # image -> caption text
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")  # (image, question) -> answer
def process_image_question(image: Image.Image, question: str):
    """Answer a question about an uploaded image, with a spoken answer.

    Runs OCR (pytesseract), image captioning, and visual question
    answering over *image*, then synthesizes the VQA answer to speech
    with gTTS.

    Args:
        image: Uploaded PIL image, or None when nothing was uploaded.
        question: Free-form question about the image.

    Returns:
        (text, audio_path): a formatted result string and the path to an
        MP3 of the spoken answer, or (error message, None) on failure.
    """
    if image is None:
        return "No image uploaded.", None
    try:
        # OCR text (may be empty for images with no readable text).
        extracted_text = pytesseract.image_to_string(image)

        # Caption the image.
        caption = caption_model(image)[0]['generated_text']

        # Visual QA: take the top-ranked answer.
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Speak the answer. Close the temp file handle *before* gTTS
        # writes to its path so the save also works on Windows (the
        # original saved while the NamedTemporaryFile was still open).
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tmp.close()
        gTTS(text=answer).save(tmp.name)
        audio_path = tmp.name

        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
        return final_output, audio_path
    except Exception as e:
        # UI boundary: surface any model/OCR/TTS failure as text instead
        # of crashing the request.
        return f"❌ Error: {e}", None
# Build the UI components first, then wire them into the Interface.
_image_input = gr.Image(type="pil", label="Upload Image")
_question_input = gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
_result_output = gr.Textbox(label="Result", lines=10)
_audio_output = gr.Audio(label="Answer (Audio)", type="filepath")

gui = gr.Interface(
    fn=process_image_question,
    inputs=[_image_input, _question_input],
    outputs=[_result_output, _audio_output],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question β€” even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud.",
)
# Mount the Gradio UI at the web root; it handles "/" from here on.
app = gr.mount_gradio_app(app, gui, path="/")

@app.get("/")
def home():
    """Fallback for "/" — normally shadowed by the Gradio mount above.

    The original redirected "/" back to "/", which is an infinite
    redirect loop if this handler is ever reached; send clients to the
    FastAPI docs instead.
    """
    return RedirectResponse(url="/docs")