from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
app = FastAPI()
# OCR Reader (lazy import inside function to avoid ImportError on Spaces)
ocr_reader = None
# Captioning and VQA Pipelines
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
def process_image_question(image: Image.Image, question: str):
    if image is None:
        return "No image uploaded.", None
    try:
        # Import EasyOCR only when needed
        global ocr_reader
        if ocr_reader is None:
            import easyocr
            ocr_reader = easyocr.Reader(['en'], gpu=False)
        # Convert PIL image to numpy array
        np_image = np.array(image)
        # OCR extraction
        ocr_texts = ocr_reader.readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)
        # Generate caption
        caption = caption_model(image)[0]['generated_text']
        # Ask question on image using VQA
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']
        # Combine results
        final_output = f"Caption: {caption}\n\nOCR Text:\n{extracted_text}\n\nAnswer: {answer}"
        # Convert answer to speech
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name
        return final_output, audio_path
    except Exception as e:
        return f"Error processing image: {e}", None
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath")
    ],
    title="Image QA with Voice",
    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response."
)
# Mount the Gradio UI under /gradio and redirect the bare root URL to it
# (mounting at "/" while also redirecting "/" to "/" would loop).
app = gr.mount_gradio_app(app, gui, path="/gradio")

@app.get("/")
def home():
    return RedirectResponse(url="/gradio")
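
# Optional local entry point: a minimal sketch, assuming this file is saved as
# app.py and uvicorn is installed. Hugging Face Spaces starts the server itself,
# so this block only matters when running the app on your own machine.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 is the port Spaces conventionally exposes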