"""Image Q&A web app: caption + OCR + visual QA, with spoken answers.

Mounts a Gradio UI on a FastAPI app. Given an uploaded image and a
question, it returns the image caption, OCR-extracted text, the VQA
answer, and an MP3 rendering of the answer.
"""

from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile
import pytesseract  # āœ… Replacing easyocr

app = FastAPI()

# Models are loaded once at import time (weights download on first run).
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")


def process_image_question(image: Image.Image, question: str):
    """Run captioning, OCR, and visual QA on *image*; voice the VQA answer.

    Args:
        image: PIL image from the Gradio upload widget (None if nothing uploaded).
        question: free-text question about the image.

    Returns:
        (summary_text, audio_path) — audio_path is the MP3 of the answer,
        or None when there is no image or any step fails (the summary then
        carries the error message instead).
    """
    if image is None:
        return "No image uploaded.", None
    try:
        # OCR text using pytesseract
        extracted_text = pytesseract.image_to_string(image)

        # Image caption
        caption = caption_model(image)[0]['generated_text']

        # Visual QA — keep only the top-ranked answer
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Synthesize the answer as speech. delete=False keeps the file
        # around for Gradio to serve; the OS temp dir owns cleanup.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name

        final_output = f"šŸ–¼ļø Caption: {caption}\n\nšŸ“– OCR Text:\n{extracted_text}\n\nā“ Answer: {answer}"
        return final_output, audio_path
    except Exception as e:
        # Surface model/OCR/TTS failures in the UI instead of a 500.
        return f"āŒ Error: {e}", None


gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question — even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud.",
)

# BUG FIX: the original mounted Gradio at "/" and then declared a GET "/"
# route redirecting to "/" — a self-redirect loop, and one shadowed by the
# mount anyway. Mount the UI at "/gradio" and redirect the root there, so
# visiting "/" still lands on the UI.
app = gr.mount_gradio_app(app, gui, path="/gradio")


@app.get("/")
def home():
    """Redirect the bare root URL to the Gradio UI."""
    return RedirectResponse(url="/gradio")