# Summarization / app.py
# ikraamkb's picture
# Update app.py
# aa2a251 verified
# raw
# history blame
# 2.39 kB
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
# FastAPI application object; the Gradio interface is mounted onto it later
# in this file.
app = FastAPI()

# Shared EasyOCR reader. It starts unset and is created on first use inside
# the request handler, because the easyocr import is deferred to avoid an
# ImportError on Spaces.
ocr_reader = None

# Hugging Face pipelines for image captioning and visual question answering.
caption_model = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)
vqa_model = pipeline(
    task="visual-question-answering",
    model="dandelin/vilt-b32-finetuned-vqa",
)
def _get_ocr_reader():
    """Return the shared EasyOCR reader, creating it on the first call."""
    global ocr_reader
    if ocr_reader is None:
        # Import EasyOCR only when needed so that importing this module does
        # not fail when easyocr cannot be imported at startup.
        import easyocr
        ocr_reader = easyocr.Reader(['en'], gpu=False)
    return ocr_reader


def _synthesize_speech(text):
    """Save *text* as spoken audio in a temporary MP3 file.

    Returns the path of the MP3 file, or None when speech synthesis fails.
    The file is intentionally left on disk on success because the caller
    hands its path to the audio output component.
    """
    # Reserve a unique path and close the handle before gTTS writes to it:
    # re-opening a named temporary file while it is still open is not
    # portable across platforms.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    try:
        gTTS(text=text).save(audio_path)
    except Exception:
        # Audio is an optional extra. Remove the unusable file and let the
        # caller return the text result without it.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        return None
    return audio_path


def process_image_question(image: Image.Image, question: str):
    """Caption an image, read its text, answer a question and voice the answer.

    Args:
        image: The uploaded PIL image, or None when nothing was uploaded.
        question: The question to ask about the image.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` combines the caption, the OCR
        text and the answer; ``audio_path`` is the path of an MP3 file with
        the spoken answer. ``audio_path`` is None when no image was given,
        when image processing failed (``text`` then holds the error message),
        or when only the speech synthesis step failed.
    """
    if image is None:
        return "No image uploaded.", None

    try:
        # OCR works on a numpy array rather than on the PIL image.
        ocr_texts = _get_ocr_reader().readtext(np.array(image), detail=0)
        extracted_text = "\n".join(ocr_texts)

        caption = caption_model(image)[0]['generated_text']

        # Treat a missing question as an empty string so the pipeline always
        # receives text.
        vqa_result = vqa_model(image=image, question=question or "")
        answer = vqa_result[0]['answer']
    except Exception as e:
        # Boundary with the UI: report the failure as text instead of raising.
        return f"❌ Error processing image: {e}", None

    final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"

    # Speech synthesis runs outside the try block above so that a TTS failure
    # does not throw away the text result that was already computed.
    return final_output, _synthesize_speech(answer)
# Gradio front end: takes an image and a question, and shows the two values
# returned by process_image_question (result text and an audio file path).
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response.",
)
# Serve the Gradio UI under its own sub-path. Mounted at "/", it would capture
# every request, so the root route below would never run, and that route would
# only redirect to itself if it did.
app = gr.mount_gradio_app(app, gui, path="/gradio")


@app.get("/")
def home():
    """Redirect the site root to the mounted Gradio interface."""
    return RedirectResponse(url="/gradio")