Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Files Community

Summarization / app.py

ikraamkb

Update app.py

44d6661 verified 6 months ago

raw

history blame

3.87 kB

	from fastapi import FastAPI
	from fastapi.responses import RedirectResponse
	import fitz
	import docx
	import openpyxl
	import pptx
	import io
	import os
	import tempfile
	from PIL import Image
	import gradio as gr
	from transformers import pipeline

	# Build both Hugging Face pipelines once at import time; the handlers
	# defined below reuse these module-level objects on every call.
	summarizer = pipeline(
	    "summarization",
	    model="sshleifer/distilbart-cnn-12-6",
	)
	image_captioner = pipeline(
	    "image-to-text",
	    model="nlpconnect/vit-gpt2-image-captioning",
	)

	# FastAPI application; the Gradio UI is mounted onto it further down.
	app = FastAPI()

	# -------------------------
	# Extraction Functions
	# -------------------------
	def extract_text_from_pdf(file_bytes):
	    """Extract the plain text of every page of a PDF document.

	    The document is opened straight from memory, so nothing is written to
	    disk and no temporary file can be left behind when parsing fails.

	    Args:
	        file_bytes: Raw bytes of the PDF document.

	    Returns:
	        The text of all pages joined with newlines, or a message starting
	        with "❌ PDF extraction error:" if the document cannot be read.
	    """
	    try:
	        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
	            return "\n".join(page.get_text() for page in doc)
	    except Exception as e:
	        # Failures are reported as text rather than raised, matching the
	        # other extract_text_from_* helpers in this file.
	        return f"❌ PDF extraction error: {e}"

	def extract_text_from_docx(file_bytes):
	    """Return the non-blank paragraphs of a DOCX document, one per line.

	    Args:
	        file_bytes: Raw bytes of the DOCX document.

	    Returns:
	        The paragraph texts joined with newlines, or a message starting
	        with "❌ DOCX extraction error:" if the document cannot be read.
	    """
	    try:
	        document = docx.Document(io.BytesIO(file_bytes))
	        kept = []
	        for paragraph in document.paragraphs:
	            content = paragraph.text
	            # Whitespace-only paragraphs carry no text worth keeping.
	            if content.strip():
	                kept.append(content)
	        return "\n".join(kept)
	    except Exception as err:
	        return f"❌ DOCX extraction error: {err}"

	def extract_text_from_pptx(file_bytes):
	    """Collect the text of every text-bearing shape in a PPTX presentation.

	    Args:
	        file_bytes: Raw bytes of the PPTX presentation.

	    Returns:
	        The shape texts in slide order joined with newlines, or a message
	        starting with "❌ PPTX extraction error:" if the file cannot be read.
	    """
	    try:
	        presentation = pptx.Presentation(io.BytesIO(file_bytes))
	        # Only shapes exposing a `text` attribute contribute to the output.
	        pieces = [
	            shape.text
	            for slide in presentation.slides
	            for shape in slide.shapes
	            if hasattr(shape, "text")
	        ]
	        return "\n".join(pieces)
	    except Exception as err:
	        return f"❌ PPTX extraction error: {err}"

	def extract_text_from_xlsx(file_bytes):
	    """Flatten the cell values of an XLSX workbook into plain text.

	    Each non-empty row becomes one line with its cell values separated by
	    single spaces. Cached formula results are used instead of formulas
	    (``data_only=True``).

	    Args:
	        file_bytes: Raw bytes of the XLSX workbook.

	    Returns:
	        The row lines joined with newlines, or a message starting with
	        "❌ XLSX extraction error:" if the workbook cannot be read.
	    """
	    try:
	        wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
	        try:
	            lines = []
	            # Iterate worksheets only; chart sheets have no cell rows.
	            for ws in wb.worksheets:
	                for row in ws.iter_rows(values_only=True):
	                    # Test for None/"" explicitly: a plain truthiness check
	                    # would also discard legitimate 0 and False values.
	                    cells = [str(cell) for cell in row if cell is not None and cell != ""]
	                    # Skip fully empty rows so they do not pad the output
	                    # with blank lines.
	                    if cells:
	                        lines.append(" ".join(cells))
	        finally:
	            # Read-only workbooks must be closed explicitly.
	            wb.close()
	        return "\n".join(lines)
	    except Exception as e:
	        return f"❌ XLSX extraction error: {e}"

	# -------------------------
	# Main Logic
	# -------------------------
	def summarize_document(file):
	    """Summarize an uploaded PDF, DOCX, PPTX or XLSX document.

	    Args:
	        file: The upload handed over by the Gradio file input. May be
	            ``None`` (nothing uploaded), a filesystem path (``str`` or
	            ``os.PathLike``), or a file-like object with ``read()`` and an
	            optional ``name`` attribute.

	    Returns:
	        A user-facing string: the summary prefixed with "📄 Summary:", or
	        a message describing why no summary could be produced.
	    """
	    if file is None:
	        return "❗ No file uploaded."

	    # Depending on the Gradio version/configuration the upload arrives
	    # either as a path or as an open file object; support both.
	    if isinstance(file, (str, os.PathLike)):
	        path = os.fspath(file)
	        filename = str(path).lower()
	    else:
	        path = None
	        filename = str(getattr(file, "name", "") or "").lower()

	    # Reject unknown formats before touching the file contents.
	    if not filename.endswith((".pdf", ".docx", ".pptx", ".xlsx")):
	        return "❌ Unsupported file format."

	    try:
	        if path is not None:
	            with open(path, "rb") as handle:
	                file_bytes = handle.read()
	        else:
	            file_bytes = file.read()
	    except (OSError, AttributeError, ValueError) as e:
	        return f"❌ Could not read file: {e}"

	    if filename.endswith(".pdf"):
	        text = extract_text_from_pdf(file_bytes)
	    elif filename.endswith(".docx"):
	        text = extract_text_from_docx(file_bytes)
	    elif filename.endswith(".pptx"):
	        text = extract_text_from_pptx(file_bytes)
	    else:
	        text = extract_text_from_xlsx(file_bytes)

	    if not text or not text.strip():
	        return "❗ No extractable text found."

	    # The extractors report failures as text; show such a message to the
	    # user as-is instead of feeding it to the summarizer.
	    extraction_errors = (
	        "❌ PDF extraction error:",
	        "❌ DOCX extraction error:",
	        "❌ PPTX extraction error:",
	        "❌ XLSX extraction error:",
	    )
	    if text.startswith(extraction_errors):
	        return text

	    try:
	        # Only the first 3000 characters are summarized to bound the input size.
	        summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
	        return f"📄 Summary:\n{summary[0]['summary_text']}"
	    except Exception as e:
	        return f"⚠️ Summarization error: {e}"

	def interpret_image(image):
	    """Generate a caption for an image with the captioning pipeline.

	    Args:
	        image: The image to describe, passed unchanged to ``image_captioner``.

	    Returns:
	        The caption prefixed with "🖼️ Caption:", or a message starting
	        with "⚠️ Image captioning error:" if captioning fails.
	    """
	    try:
	        predictions = image_captioner(image)
	        # The first prediction's generated text is used as the caption.
	        caption = predictions[0]["generated_text"]
	        return f"🖼️ Caption:\n{caption}"
	    except Exception as err:
	        return f"⚠️ Image captioning error: {err}"

	# -------------------------
	# Gradio Interfaces
	# -------------------------
	# Document tab: a file upload goes to summarize_document and the returned
	# string is shown as plain text.
	doc_summary = gr.Interface(
	    title="📄 Document Summarizer",
	    fn=summarize_document,
	    inputs=gr.File(label="Upload a Document"),
	    outputs="text",
	)

	# Image tab: the upload is delivered to interpret_image as a PIL image and
	# the returned string is shown as plain text.
	img_caption = gr.Interface(
	    title="🖼️ Image Interpreter",
	    fn=interpret_image,
	    inputs=gr.Image(type="pil", label="Upload an Image"),
	    outputs="text",
	)

	# -------------------------
	# FastAPI Integration
	# -------------------------
	# Combine both interfaces into one tabbed UI; the tab names are listed in
	# the same order as the interfaces.
	demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
	# Mount the Gradio UI on the FastAPI app at the root path and rebind `app`
	# to the value returned by mount_gradio_app.
	app = gr.mount_gradio_app(app, demo, path="/")

	# NOTE(review): this route is registered for "/" after the Gradio UI was
	# mounted at "/", and it redirects to "/" itself. Presumably the mount
	# answers the request first so this handler is never reached; if it were
	# reached it would redirect to its own URL. Confirm the intended target.
	@app.get("/")
	def home():
	    """Redirect a GET request for the root path to "/"."""
	    return RedirectResponse(url="/")