# Summarization / app.py
# ikraamkb's picture
# Update app.py
# 44d6661 verified
# raw
# history blame
# 3.87 kB
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import fitz
import docx
import openpyxl
import pptx
import io
import os
import tempfile
from PIL import Image
import gradio as gr
from transformers import pipeline
# Build both Hugging Face pipelines once, at import time; the handler
# functions below reference these module-level objects on every call.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
image_captioner = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)

# ASGI application object; the Gradio UI is attached to it further down.
app = FastAPI()
# -------------------------
# Extraction Functions
# -------------------------
def extract_text_from_pdf(file_bytes):
    """Extract the text of every page of a PDF given as raw bytes.

    The bytes are spooled to a temporary ``.pdf`` file, opened with
    ``fitz`` and the per-page text is joined with newlines.

    Args:
        file_bytes: Raw content of the PDF document.

    Returns:
        The extracted text, or a string starting with
        ``"❌ PDF extraction error:"`` if anything goes wrong. This function
        never raises.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # Record the path before writing so that a failed write is still
            # cleaned up below.
            tmp_path = tmp.name
            tmp.write(file_bytes)
        with fitz.open(tmp_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        return f"❌ PDF extraction error: {e}"
    finally:
        # The file was created with delete=False, so it must be removed by
        # hand on every path, including when opening or parsing failed.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # extraction result that is already being returned.
                pass
def extract_text_from_docx(file_bytes):
    """Return the non-blank paragraphs of a DOCX document, one per line.

    Args:
        file_bytes: Raw content of the ``.docx`` file.

    Returns:
        The paragraph texts joined with newlines (paragraphs that are empty
        or whitespace-only are skipped), or a string starting with
        ``"❌ DOCX extraction error:"`` on any failure.
    """
    try:
        document = docx.Document(io.BytesIO(file_bytes))
        kept = []
        for paragraph in document.paragraphs:
            content = paragraph.text
            if content.strip():
                kept.append(content)
        return "\n".join(kept)
    except Exception as err:
        return f"❌ DOCX extraction error: {err}"
def extract_text_from_pptx(file_bytes):
    """Collect the text of every text-bearing shape in a PPTX deck.

    Args:
        file_bytes: Raw content of the ``.pptx`` file.

    Returns:
        The ``text`` of each shape that exposes one, in slide order, joined
        with newlines; or a string starting with
        ``"❌ PPTX extraction error:"`` on any failure.
    """
    try:
        presentation = pptx.Presentation(io.BytesIO(file_bytes))
        # Shapes without a ``text`` attribute are skipped.
        chunks = [
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(chunks)
    except Exception as err:
        return f"❌ PPTX extraction error: {err}"
def extract_text_from_xlsx(file_bytes):
    """Flatten every sheet of an XLSX workbook into plain text.

    Each row becomes one line made of its non-empty cell values separated
    by single spaces; rows from all sheets are joined with newlines. The
    workbook is loaded with ``data_only=True`` and ``read_only=True``.

    Args:
        file_bytes: Raw content of the ``.xlsx`` file.

    Returns:
        The flattened text, or a string starting with
        ``"❌ XLSX extraction error:"`` on any failure. This function never
        raises.
    """
    try:
        wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
        try:
            lines = []
            for sheet in wb.sheetnames:
                ws = wb[sheet]
                for row in ws.iter_rows(values_only=True):
                    # Only skip genuinely empty cells. A plain truthiness
                    # test would also discard real data such as 0 or False.
                    lines.append(
                        " ".join(
                            str(cell)
                            for cell in row
                            if cell is not None and cell != ""
                        )
                    )
            return "\n".join(lines)
        finally:
            # Read-only workbooks load lazily and must be closed explicitly.
            wb.close()
    except Exception as e:
        return f"❌ XLSX extraction error: {e}"
# -------------------------
# Main Logic
# -------------------------
def summarize_document(file):
    """Summarize an uploaded PDF, DOCX, PPTX or XLSX document.

    Args:
        file: The upload to summarize. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object with a ``read()`` method and,
            optionally, a ``name`` attribute holding the file name. ``None``
            means nothing was uploaded.

    Returns:
        A user-facing string: the summary prefixed with ``"📄 Summary:"``,
        or a message explaining why no summary could be produced. This
        function does not raise for bad uploads.
    """
    if file is None:
        return "❗ No file uploaded."

    # NOTE(review): depending on the Gradio version/config the File component
    # may hand over a path string instead of a file object - confirm against
    # the installed version. Both shapes are accepted here.
    is_path = isinstance(file, (str, os.PathLike))
    if is_path:
        filename = os.fsdecode(file).lower()
    else:
        filename = str(getattr(file, "name", "") or "").lower()

    # Reject unknown formats before spending time reading the upload.
    if not filename.endswith((".pdf", ".docx", ".pptx", ".xlsx")):
        return "❌ Unsupported file format."

    try:
        if is_path:
            with open(file, "rb") as handle:
                file_bytes = handle.read()
        else:
            file_bytes = file.read()
    except (OSError, AttributeError) as e:
        return f"⚠️ Could not read uploaded file: {e}"

    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(file_bytes)
    elif filename.endswith(".docx"):
        text = extract_text_from_docx(file_bytes)
    elif filename.endswith(".pptx"):
        text = extract_text_from_pptx(file_bytes)
    else:
        text = extract_text_from_xlsx(file_bytes)

    if not text or not text.strip():
        return "❗ No extractable text found."

    # The extractors report failures as "❌ <KIND> extraction error: ..."
    # strings. Surface those directly instead of summarizing the error text.
    first_line = text.partition("\n")[0]
    if first_line.startswith("❌ ") and "extraction error:" in first_line:
        return text

    try:
        # Cap the input at 3000 characters and additionally let the tokenizer
        # truncate, so dense text cannot exceed the model's input limit.
        summary = summarizer(
            text[:3000],
            max_length=150,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return f"📄 Summary:\n{summary[0]['summary_text']}"
    except Exception as e:
        return f"⚠️ Summarization error: {e}"
def interpret_image(image):
    """Generate a caption for an uploaded image.

    Args:
        image: The image to caption, passed straight to the captioning
            pipeline (the UI is configured to supply a PIL image).
            ``None`` means nothing was uploaded.

    Returns:
        A user-facing string: the caption prefixed with ``"🖼️ Caption:"``,
        a notice when no image was provided, or a message starting with
        ``"⚠️ Image captioning error:"`` if the model call fails.
    """
    if image is None:
        # Answer directly rather than letting the model fail on None.
        return "❗ No image uploaded."
    try:
        captions = image_captioner(image)
        return f"🖼️ Caption:\n{captions[0]['generated_text']}"
    except Exception as e:
        return f"⚠️ Image captioning error: {e}"
# -------------------------
# Gradio Interfaces
# -------------------------
# Document tab: one file upload in, plain-text summary (or status message) out.
doc_summary = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(label="Upload a Document"),
    outputs="text",
    # The title emoji was mis-encoded; restored to the intended character.
    title="📄 Document Summarizer",
)
# Image tab: one image upload (delivered as a PIL image) in, caption text out.
img_caption = gr.Interface(
    fn=interpret_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs="text",
    # The title emoji was mis-encoded; restored to the intended character.
    title="🖼️ Image Interpreter",
)
# -------------------------
# FastAPI Integration
# -------------------------
# Combine both tools into a single tabbed UI and serve it at the site root.
demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
app = gr.mount_gradio_app(app, demo, path="/")

# NOTE: there is deliberately no separate ``@app.get("/")`` handler here.
# The Gradio app is mounted at "/" and therefore already answers that path;
# a route registered after the mount is never reached, and a handler that
# redirects "/" to "/" would loop forever if it ever were.