"""FastAPI + Gradio app: image captioning, OCR, and visual QA with a spoken answer."""

import contextlib
import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import numpy as np
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
from gtts import gTTS
from PIL import Image
from transformers import pipeline

app = FastAPI()

# OCR reader, created on first use (EasyOCR is imported lazily inside
# _get_ocr_reader to avoid an ImportError on Spaces at module import time).
ocr_reader = None

# Captioning and VQA pipelines, loaded once when the module is imported.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")


def _get_ocr_reader():
    """Return the shared EasyOCR reader, creating it on the first call."""
    global ocr_reader
    if ocr_reader is None:
        # Import EasyOCR only when needed.
        import easyocr
        ocr_reader = easyocr.Reader(['en'], gpu=False)
    return ocr_reader


def _synthesize_speech(text: str) -> str:
    """Write *text* as spoken audio to a new temporary .mp3 and return its path.

    The temporary file is created with ``delete=False`` so that it outlives
    this function; the caller (Gradio) reads it by path afterwards. If the
    synthesis fails, the empty temporary file is removed and the exception
    propagates.
    """
    # Reserve a path and close the handle *before* gTTS writes to it, so the
    # save does not depend on being able to reopen a file that is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    try:
        gTTS(text=text).save(audio_path)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        with contextlib.suppress(OSError):
            os.remove(audio_path)
        raise
    return audio_path


def process_image_question(
    image: Optional[Image.Image], question: str
) -> Tuple[str, Optional[str]]:
    """Caption an image, extract its text, answer a question, and speak the answer.

    Args:
        image: The uploaded PIL image, or ``None`` when nothing was uploaded.
        question: The question to ask about the image.

    Returns:
        A ``(text, audio_path)`` pair. ``text`` combines the caption, the OCR
        text and the VQA answer (or holds an error message). ``audio_path`` is
        the path of an .mp3 with the spoken answer, or ``None`` when no image
        was given, processing failed, or the audio could not be produced.
    """
    if image is None:
        return "No image uploaded.", None

    try:
        # EasyOCR is fed a numpy array; the transformers pipelines take the
        # PIL image directly.
        np_image = np.array(image)

        # detail=0 makes readtext return the recognized strings only.
        ocr_texts = _get_ocr_reader().readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)

        caption = caption_model(image)[0]['generated_text']

        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']
    except Exception as e:
        # UI boundary: report the failure in the result box instead of raising.
        return f"❌ Error processing image: {e}", None

    final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"

    # Speech synthesis is handled separately so that a failure there does not
    # throw away the text results that were already computed.
    try:
        audio_path = _synthesize_speech(answer)
    except Exception as e:
        return f"{final_output}\n\n(Audio unavailable: {e})", None

    return final_output, audio_path


gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response.",
)

# Serve the Gradio UI from the FastAPI application root.
app = gr.mount_gradio_app(app, gui, path="/")


@app.get("/")
def home():
    """Redirect the root URL to "/".

    NOTE(review): this route is registered after the Gradio app is mounted at
    the same path and redirects "/" to itself. It looks like the mount handles
    "/" first, leaving this handler unreachable; if it were reached it would
    redirect in a loop. Confirm the intent (e.g. mounting the UI under a
    sub-path and redirecting there) before changing either path.
    """
    return RedirectResponse(url="/")