File size: 2,199 Bytes
95c2451
5b4fc38
3e87c53
fac31c8
 
 
 
653c3ae
fac31c8
 
 
e5b6ad2
1e83db4
a74f8b0
fac31c8
 
3fb07d9
fac31c8
 
 
cf9a79a
fac31c8
 
 
cf9a79a
af32fa4
fac31c8
 
cf9a79a
fac31c8
 
 
6dfac5c
fac31c8
 
3fb07d9
fac31c8
 
 
12d05c0
fac31c8
 
12d05c0
fac31c8
 
 
 
 
12d05c0
fac31c8
12d05c0
3e87c53
fac31c8
 
 
 
 
 
 
 
 
 
 
 
 
 
5b4fc38
 
fac31c8
3e87c53
5b4fc38
fac31c8
5b4fc38
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import io
import numpy as np
import easyocr
from transformers import pipeline
from gtts import gTTS
import tempfile
import os

# FastAPI application; the Gradio UI is mounted onto it further down.
app = FastAPI()

# OCR Reader
# EasyOCR English reader; gpu=False forces CPU inference so the app also
# runs on machines without CUDA. NOTE(review): model weights are downloaded
# on first use, so the first startup is slow.
ocr_reader = easyocr.Reader(['en'], gpu=False)

# Captioning and VQA Pipelines
# Hugging Face pipelines: ViT-GPT2 for image captioning, ViLT for visual
# question answering. Loaded once at import time (slow startup, no per-request
# model loading).
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

def process_image_question(image: Image.Image, question: str):
    """Run OCR, captioning, and visual question answering on one image.

    Args:
        image: Uploaded PIL image (any mode; normalized to RGB internally).
        question: Natural-language question about the image.

    Returns:
        Tuple of (result text, path to an MP3 of the spoken answer).
        The audio path is None when no audio could be produced.
    """
    if image is None:
        return "No image uploaded.", None
    # Guard empty/whitespace questions instead of feeding them to the VQA
    # pipeline, which may error on blank input.
    if not question or not question.strip():
        return "Please enter a question about the image.", None

    try:
        # Normalize to RGB: EasyOCR and the pipelines expect 3-channel input,
        # and uploads may arrive as RGBA, palette, or grayscale images.
        rgb_image = image.convert("RGB")
        np_image = np.array(rgb_image)

        # OCR extraction (detail=0 returns plain strings, no boxes/scores).
        ocr_texts = ocr_reader.readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)

        # Generate caption
        caption = caption_model(rgb_image)[0]['generated_text']

        # Ask question on image using VQA
        vqa_result = vqa_model(image=rgb_image, question=question)
        answer = vqa_result[0]['answer']

        # Combine results (emoji repaired from mojibake in the original).
        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"

        # Convert answer to speech. Close the temp file BEFORE gTTS writes to
        # it: saving through an open NamedTemporaryFile handle fails on
        # Windows, where the open handle locks the file. gTTS also raises on
        # empty text, so skip audio for a blank answer.
        audio_path = None
        if answer.strip():
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
            tmp.close()
            try:
                gTTS(text=answer).save(tmp.name)
                audio_path = tmp.name
            except Exception:
                # Best-effort audio: drop the orphaned temp file and still
                # return the text results instead of failing the request.
                os.unlink(tmp.name)

        return final_output, audio_path

    except Exception as e:
        # Boundary handler: surface any model/IO failure in the UI rather
        # than crashing the Gradio request.
        return f"❌ Error processing image: {e}", None

# Gradio front end: image + question in, combined text report and spoken
# answer out. Wired to process_image_question defined above.
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        # type="filepath" matches the MP3 path the handler returns.
        gr.Audio(label="Answer (Audio)", type="filepath")
    ],
    # Title emoji repaired from mojibake ("๐Ÿง " was UTF-8 🧠 decoded as Thai).
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response."
)

# Mount the Gradio UI under its own sub-path. Mounting at "/" shadows every
# route registered afterwards, and the original home() redirected "/" to "/"
# — an infinite redirect loop if it was ever reached.
app = gr.mount_gradio_app(app, gui, path="/gradio")


@app.get("/")
def home():
    """Redirect the bare root URL to the Gradio interface."""
    return RedirectResponse(url="/gradio")