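# Image Q&A Space: combines OCR (EasyOCR), image captioning and visual question
# answering (Hugging Face pipelines), and text-to-speech (gTTS) behind a Gradio UI
# served from a FastAPI app.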
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import io
import numpy as np
import easyocr
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
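
# FastAPI application that hosts the Gradio UI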
app = FastAPI()
# OCR Reader
ocr_reader = easyocr.Reader(['en'], gpu=False)
# Captioning and VQA Pipelines
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
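# Both pipelines (and the EasyOCR models) are loaded once at startup; the first
# run downloads the model weights.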
def process_image_question(image: Image.Image, question: str):
    if image is None:
        return "No image uploaded.", None
    try:
        # Convert PIL image to numpy array for EasyOCR
        np_image = np.array(image)
        # OCR extraction
        ocr_texts = ocr_reader.readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)
        # Generate caption
        caption = caption_model(image)[0]['generated_text']
        # Ask question on image using VQA
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']
        # Combine results
        final_output = f"Caption: {caption}\n\nOCR Text:\n{extracted_text}\n\nAnswer: {answer}"
        # Convert answer to speech
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name
        return final_output, audio_path
    except Exception as e:
        return f"Error processing image: {e}", None
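
# Gradio UI: takes an image and a question, returns the combined text result
# and the spoken answer as an audio file.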
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath")
    ],
    title="Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response."
)
# Mount the Gradio UI under a subpath and redirect the root to it
# (mounting at "/" and redirecting "/" back to itself would loop).
app = gr.mount_gradio_app(app, gui, path="/gradio")

@app.get("/")
def home():
    return RedirectResponse(url="/gradio")
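
# Local run (assuming this file is saved as app.py; HF Spaces expects port 7860):
#   uvicorn app:app --host 0.0.0.0 --port 7860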