File size: 2,058 Bytes
95c2451
5b4fc38
3e87c53
fac31c8
 
653c3ae
fac31c8
 
 
05e0b44
b9ed063
1e83db4
a74f8b0
b298682
fac31c8
 
05e0b44
cf9a79a
fac31c8
 
 
cf9a79a
af32fa4
b298682
fac31c8
cf9a79a
05e0b44
 
6dfac5c
b298682
fac31c8
3fb07d9
b298682
fac31c8
 
12d05c0
b298682
fac31c8
 
 
 
12d05c0
b298682
fac31c8
12d05c0
3e87c53
b298682
fac31c8
 
 
 
 
 
 
 
 
 
 
 
b298682
5b4fc38
 
fac31c8
3e87c53
5b4fc38
fac31c8
5b4fc38
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
import pytesseract  # βœ… Replacing easyocr

app = FastAPI()

# Models
# Hugging Face pipelines are loaded once at import time: the first start is
# slow and downloads weights if they are not cached locally.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")


def process_image_question(image: Image.Image, question: str):
    """Caption an image, OCR its text, answer *question* about it, and
    synthesize the answer as speech.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image from the Gradio upload widget; ``None`` when nothing was uploaded.
    question : str
        Natural-language question about the image.

    Returns
    -------
    tuple[str, str | None]
        A formatted text report (caption + OCR text + answer) and the path
        to an MP3 file with the spoken answer, or an error message and
        ``None`` if any step failed.
    """
    if image is None:
        return "No image uploaded.", None

    try:
        # OCR text using pytesseract (requires the tesseract binary on PATH).
        extracted_text = pytesseract.image_to_string(image)

        # Image caption.
        caption = caption_model(image)[0]['generated_text']

        # Visual QA.
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Answer as speech. delete=False keeps the file around for Gradio to
        # serve (NOTE(review): these temp files are never cleaned up). Save
        # only after the handle is closed: writing to a still-open
        # NamedTemporaryFile fails on Windows.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)

        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
        return final_output, audio_path

    except Exception as e:
        # Best-effort UI boundary: surface the error to the user rather
        # than crashing the request.
        return f"❌ Error: {e}", None

gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath")
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question β€” even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud."
)

# Mount the Gradio UI under its own path. Mounting at "/" would swallow
# every request, leaving the route below unreachable — and a "/" -> "/"
# redirect would loop forever if it ever were reached.
app = gr.mount_gradio_app(app, gui, path="/gradio")

@app.get("/")
def home():
    """Redirect the application root to the Gradio UI."""
    return RedirectResponse(url="/gradio")