from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
import pytesseract  # ✅ replacing easyocr
app = FastAPI()
# Models
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
def process_image_question(image: Image.Image, question: str):
    if image is None:
        return "No image uploaded.", None
    try:
        # Convert image to numpy
        np_image = np.array(image)

        # OCR text using pytesseract
        extracted_text = pytesseract.image_to_string(image)

        # Caption
        caption = caption_model(image)[0]['generated_text']

        # Visual QA
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Answer as speech
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name

        final_output = f"🖼️ Caption: {caption}\n\n📝 OCR Text:\n{extracted_text}\n\n✅ Answer: {answer}"
        return final_output, audio_path
    except Exception as e:
        return f"❌ Error: {e}", None
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath")
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question, even if there's no readable text. The app runs OCR, image captioning, and visual question answering, and reads the answer aloud."
)
# Mount the Gradio UI under /gradio and redirect the root path to it,
# so this route does not loop back onto the mounted app.
app = gr.mount_gradio_app(app, gui, path="/gradio")

@app.get("/")
def home():
    return RedirectResponse(url="/gradio")
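
# To run this app locally (a sketch, assuming the file is saved as app.py and
# that Tesseract plus the Python packages implied by the imports are installed):
#
#   pip install fastapi uvicorn gradio transformers torch pillow gtts pytesseract
#   apt-get install -y tesseract-ocr   # system binary required by pytesseract
#   uvicorn app:app --host 0.0.0.0 --port 7860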