File size: 2,394 Bytes
95c2451
5b4fc38
3e87c53
fac31c8
 
653c3ae
fac31c8
 
 
e5b6ad2
b9ed063
 
1e83db4
a74f8b0
aa2a251
 
3fb07d9
fac31c8
 
 
cf9a79a
fac31c8
 
 
cf9a79a
af32fa4
aa2a251
 
 
 
 
 
fac31c8
 
cf9a79a
fac31c8
 
 
6dfac5c
fac31c8
 
3fb07d9
fac31c8
 
 
12d05c0
fac31c8
 
12d05c0
fac31c8
 
 
 
 
12d05c0
fac31c8
12d05c0
3e87c53
fac31c8
 
 
 
 
 
 
 
 
 
 
 
 
 
5b4fc38
 
fac31c8
3e87c53
5b4fc38
fac31c8
5b4fc38
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os



# FastAPI application; the Gradio UI is mounted onto it further below.
app = FastAPI()

# OCR Reader (lazy import inside function to avoid ImportError on Spaces)
# Populated on first use inside process_image_question.
ocr_reader = None

# Captioning and VQA Pipelines
# NOTE: both pipeline() calls download/load model weights at import time,
# so module import is slow on a cold start.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

def process_image_question(image: Image.Image, question: str):
    """Run OCR, captioning, and visual QA on *image*, then speak the answer.

    Args:
        image: Uploaded picture as a PIL image, or ``None`` if nothing was
            submitted.
        question: Free-form question about the image.

    Returns:
        A ``(text, audio_path)`` tuple: *text* combines the generated
        caption, the OCR-extracted text, and the VQA answer; *audio_path*
        is an MP3 file with the spoken answer. On any failure, *text* is an
        error message and *audio_path* is ``None``.
    """
    if image is None:
        return "No image uploaded.", None
    # Guard against a blank question: VQA and gTTS both need real text.
    if not question or not question.strip():
        return "No question provided.", None

    try:
        # Import EasyOCR only when needed (lazy load avoids ImportError on
        # Spaces and keeps startup fast); cache the reader across calls.
        global ocr_reader
        if ocr_reader is None:
            import easyocr
            ocr_reader = easyocr.Reader(['en'], gpu=False)

        # EasyOCR expects a numpy array, not a PIL image.
        np_image = np.array(image)

        # OCR extraction; detail=0 returns just the recognized strings.
        ocr_texts = ocr_reader.readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)

        # Generate a natural-language caption for the whole image.
        caption = caption_model(image)[0]['generated_text']

        # Visual question answering; take the top-ranked answer.
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Combine results (emoji were previously mojibake-corrupted).
        final_output = (
            f"🖼️ Caption: {caption}\n\n"
            f"📖 OCR Text:\n{extracted_text}\n\n"
            f"❓ Answer: {answer}"
        )

        # Convert the answer to speech. delete=False so the file outlives
        # this call and Gradio can serve it; save AFTER the handle is
        # closed so this also works on platforms that forbid reopening an
        # open NamedTemporaryFile (e.g. Windows).
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)

        return final_output, audio_path

    except Exception as e:
        # Best-effort UI: surface any failure as text instead of crashing.
        return f"❌ Error processing image: {e}", None

# Gradio front-end: image + question in; combined text and spoken answer out.
# (Title emoji was mojibake-corrupted; restored to 🧠.)
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath")
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response."
)

# Mount the Gradio UI under /gradio instead of "/": a mount at "/" is
# registered before the route below and swallows every request, leaving
# home() as dead code — and its redirect back to "/" would be an infinite
# redirect loop if it ever ran.
app = gr.mount_gradio_app(app, gui, path="/gradio")

@app.get("/")
def home():
    """Redirect the bare root URL to the Gradio interface."""
    return RedirectResponse(url="/gradio")