Spaces:
Restarting
Restarting
from fastapi import FastAPI | |
from fastapi.responses import RedirectResponse | |
import gradio as gr | |
from PIL import Image | |
import numpy as np | |
from transformers import pipeline | |
from gtts import gTTS | |
import tempfile | |
import os | |
# FastAPI application; the Gradio interface is mounted onto it further down.
app = FastAPI()

# Captioning and visual-question-answering pipelines, constructed once at
# import time so every request reuses the same model objects.
caption_model = pipeline(task="image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline(task="visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

# OCR reader placeholder: stays None until the first request creates it,
# because EasyOCR is imported lazily inside the handler to avoid an
# ImportError on Spaces.
ocr_reader = None
def _get_ocr_reader():
    """Return the shared EasyOCR reader, creating it on first use."""
    global ocr_reader
    if ocr_reader is None:
        # Imported here rather than at module level to avoid an ImportError
        # on Spaces when the module is loaded.
        import easyocr
        ocr_reader = easyocr.Reader(['en'], gpu=False)
    return ocr_reader


def _synthesize_speech(text):
    """Save *text* as spoken MP3 audio and return the file path.

    Returns None when speech synthesis fails, so that a text-to-speech
    problem never hides the text results that were already computed.
    """
    import logging

    # Reserve a file name, then close the handle before gTTS writes to it:
    # saving by name while the handle is still open fails on platforms that
    # do not allow a second open of the same temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    try:
        gTTS(text=text).save(audio_path)
    except Exception:
        logging.getLogger(__name__).exception("Text-to-speech conversion failed")
        # Do not leave an empty, unusable file behind.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        return None
    return audio_path


def process_image_question(image: Image.Image, question: str):
    """Caption an image, read its text, answer a question, and voice the answer.

    Args:
        image: The uploaded image, or None when nothing was uploaded.
        question: The question to ask about the image.

    Returns:
        A ``(text, audio_path)`` pair. ``text`` combines the caption, the OCR
        text and the answer, or holds an error message. ``audio_path`` is the
        path of an MP3 file with the spoken answer, or None when no image was
        given, processing failed, or speech synthesis failed.
    """
    if image is None:
        return "No image uploaded.", None

    try:
        # OCR works on a numpy array; detail=0 makes readtext return only
        # the recognised strings.
        ocr_texts = _get_ocr_reader().readtext(np.array(image), detail=0)
        extracted_text = "\n".join(ocr_texts)

        caption = caption_model(image)[0]['generated_text']

        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']
    except Exception as e:
        # UI boundary: report the failure in the result box instead of raising.
        return f"β Error processing image: {e}", None

    final_output = f"πΌοΈ Caption: {caption}\n\nπ OCR Text:\n{extracted_text}\n\nβ Answer: {answer}"

    # Speech is best-effort: the text result is returned even without audio.
    return final_output, _synthesize_speech(answer)
# Input widgets: the image (handed to the handler as a PIL image) and the question.
gui_inputs = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
]

# Output widgets: the combined text result and the spoken answer (as a file path).
gui_outputs = [
    gr.Textbox(label="Result", lines=10),
    gr.Audio(label="Answer (Audio)", type="filepath"),
]

# Gradio interface wiring the widgets to process_image_question.
gui = gr.Interface(
    fn=process_image_question,
    inputs=gui_inputs,
    outputs=gui_outputs,
    title="π§ Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response.",
)

# Mount the interface on the FastAPI app at the root path.
app = gr.mount_gradio_app(app, gui, path="/")
def home():
    """Return a redirect response that points at the root path ("/")."""
    # NOTE(review): no route decorator is visible on this function in this
    # view, so it is unclear whether it is ever registered -- confirm. If it
    # is meant to serve "/", redirecting to "/" would point back at itself.
    root_redirect = RedirectResponse(url="/")
    return root_redirect