"""Image Q&A web app: caption + OCR + visual QA, with spoken answers.

Mounts a Gradio UI on a FastAPI app. Given an uploaded image and a
question, it returns the image caption, OCR-extracted text, the VQA
answer, and an MP3 rendering of the answer.
"""

from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile
import pytesseract  # āœ… Replacing easyocr

app = FastAPI()

# Models are loaded once at import time (weights download on first run).
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")


def process_image_question(image: Image.Image, question: str):
    """Run captioning, OCR, and visual QA on *image*; voice the VQA answer.

    Args:
        image: PIL image from the Gradio upload widget (None if nothing uploaded).
        question: free-text question about the image.

    Returns:
        (summary_text, audio_path) — audio_path is the MP3 of the answer,
        or None when there is no image or any step fails (the summary then
        carries the error message instead).
    """
    if image is None:
        return "No image uploaded.", None
    try:
        # OCR text using pytesseract
        extracted_text = pytesseract.image_to_string(image)

        # Image caption
        caption = caption_model(image)[0]['generated_text']

        # Visual QA — keep only the top-ranked answer
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Synthesize the answer as speech. delete=False keeps the file
        # around for Gradio to serve; the OS temp dir owns cleanup.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name

        final_output = f"šŸ–¼ļø Caption: {caption}\n\nšŸ“– OCR Text:\n{extracted_text}\n\nā“ Answer: {answer}"
        return final_output, audio_path
    except Exception as e:
        # Surface model/OCR/TTS failures in the UI instead of a 500.
        return f"āŒ Error: {e}", None


gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question — even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud.",
)

# BUG FIX: the original mounted Gradio at "/" and then declared a GET "/"
# route redirecting to "/" — a self-redirect loop, and one shadowed by the
# mount anyway. Mount the UI at "/gradio" and redirect the root there, so
# visiting "/" still lands on the UI.
app = gr.mount_gradio_app(app, gui, path="/gradio")


@app.get("/")
def home():
    """Redirect the bare root URL to the Gradio UI."""
    return RedirectResponse(url="/gradio")