Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Files Community

Summarization / app.py

ikraamkb

Update app.py

aa2a251 verified 3 months ago

raw

history blame

2.39 kB

	from fastapi import FastAPI
	from fastapi.responses import RedirectResponse
	import gradio as gr
	from PIL import Image
	import numpy as np
	from transformers import pipeline
	from gtts import gTTS
	import tempfile
	import os

	# Shared EasyOCR reader handle. It stays unset at import time: EasyOCR is
	# imported lazily inside process_image_question (the original author notes
	# this avoids an ImportError on Spaces).
	ocr_reader = None

	# FastAPI application object; the Gradio interface is mounted onto it
	# further down in this file.
	app = FastAPI()

	# Image captioning pipeline, loaded once when the module is imported.
	caption_model = pipeline(
	    task="image-to-text",
	    model="nlpconnect/vit-gpt2-image-captioning",
	)

	# Visual question answering pipeline, loaded once when the module is imported.
	vqa_model = pipeline(
	    task="visual-question-answering",
	    model="dandelin/vilt-b32-finetuned-vqa",
	)

	def _get_ocr_reader():
	    """Return the shared EasyOCR reader, creating it on first use."""
	    global ocr_reader
	    if ocr_reader is None:
	        # Imported here rather than at module level so that a missing or
	        # broken EasyOCR install only surfaces when OCR is actually requested.
	        import easyocr
	        ocr_reader = easyocr.Reader(['en'], gpu=False)
	    return ocr_reader


	def _synthesize_speech(text):
	    """Render *text* to a temporary MP3 and return its path, or None on failure."""
	    import logging

	    audio_path = None
	    try:
	        tts = gTTS(text=text)
	        # Reserve a unique file name, then close the handle before gTTS
	        # writes to that path by name.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
	            audio_path = tmp.name
	        tts.save(audio_path)
	    except Exception:
	        # Speech is an optional extra: log the failure and let the caller
	        # return the text result without audio.
	        logging.getLogger(__name__).exception(
	            "Text-to-speech failed; returning the text result without audio"
	        )
	        if audio_path is not None:
	            # The file was created with delete=False, so remove it
	            # explicitly instead of leaving an empty MP3 behind.
	            try:
	                os.remove(audio_path)
	            except OSError:
	                pass
	        return None
	    return audio_path


	def process_image_question(image: Image.Image, question: str):
	    """Caption an image, read its text, answer a question about it, and voice the answer.

	    Args:
	        image: The PIL image to analyse. ``None`` is treated as "no upload".
	        question: The question to ask about the image. ``None`` or a
	            blank/whitespace-only string skips the question-answering and
	            speech steps.

	    Returns:
	        A ``(text, audio_path)`` tuple. ``text`` holds the caption, the OCR
	        text and, when a question was asked, the answer; on failure it
	        holds an error message instead. ``audio_path`` is the path of an
	        MP3 file with the spoken answer, or ``None`` when there is no
	        image, no question, an error occurred, or speech synthesis failed.
	        The MP3 is a temporary file that this function does not delete.
	    """
	    if image is None:
	        return "No image uploaded.", None

	    try:
	        reader = _get_ocr_reader()

	        # Normalise to RGB so OCR always receives a height x width x 3 array,
	        # whatever mode the incoming image uses.
	        np_image = np.array(image.convert("RGB"))

	        # detail=0 makes readtext return only the recognised strings.
	        ocr_texts = reader.readtext(np_image, detail=0)
	        extracted_text = "\n".join(ocr_texts)

	        caption = caption_model(image)[0]['generated_text']

	        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}"

	        # Without a question there is nothing to answer or to speak, so
	        # return the caption and OCR text on their own.
	        question = (question or "").strip()
	        if not question:
	            return final_output, None

	        vqa_result = vqa_model(image=image, question=question)
	        answer = vqa_result[0]['answer']
	        final_output += f"\n\n❓ Answer: {answer}"

	    except Exception as e:
	        # UI boundary: report the failure in the result box instead of raising.
	        return f"❌ Error processing image: {e}", None

	    # Synthesised outside the try block: a speech failure must not discard
	    # the text result that was already computed.
	    return final_output, _synthesize_speech(answer)

	# Input widgets: the image is handed to the callback as a PIL image,
	# the question as plain text.
	_image_input = gr.Image(type="pil", label="Upload Image")
	_question_input = gr.Textbox(
	    label="Question",
	    placeholder="Ask a question about the image...",
	    lines=2,
	)

	# Output widgets: the combined text report and the audio file path
	# returned by the callback.
	_result_output = gr.Textbox(label="Result", lines=10)
	_audio_output = gr.Audio(label="Answer (Audio)", type="filepath")

	# Gradio interface wiring the widgets to process_image_question.
	gui = gr.Interface(
	    fn=process_image_question,
	    inputs=[_image_input, _question_input],
	    outputs=[_result_output, _audio_output],
	    title="🧠 Image QA with Voice",
	    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response.",
	)

	# Attach the Gradio interface to the FastAPI application at the root path.
	app = gr.mount_gradio_app(app, gui, path="/")

	@app.get("/")
	def home():
	    """Answer GET "/" with a redirect to "/".

	    NOTE(review): the redirect target is this route's own path, and the
	    Gradio interface is mounted at the same path earlier in this file.
	    This handler is presumably shadowed by that mount and never reached;
	    if it were reached it would redirect to itself. Confirm the intended
	    target before relying on this route.
	    """
	    redirect = RedirectResponse(url="/")
	    return redirect