Spaces:
Running
Running
File size: 2,815 Bytes
0d84ecf 3a44bf2 0d84ecf 3a44bf2 c330600 3a44bf2 461e409 3a44bf2 c3071ac 3a44bf2 c3071ac 3a44bf2 5e30a65 3a44bf2 5e30a65 3a44bf2 0d84ecf 3a44bf2 0d84ecf 5e30a65 3a44bf2 5e30a65 3a44bf2 0d84ecf 3a44bf2 0d84ecf 3a44bf2 0d84ecf c330600 3a44bf2 5b4fc38 c3071ac 3a44bf2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import gradio as gr
from transformers import pipeline
import fitz # PyMuPDF
import docx
import pptx
import openpyxl
import os
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
# Summarization model: a text2text-generation pipeline created once when the
# module is imported and reused by summarize_document for every request.
pipe = pipeline(
    "text2text-generation",
    model="FeruzaBoynazarovaas/my_awesome_billsum_model",
)
def extract_text(file):
    """Extract the plain text of an uploaded PDF, DOCX, PPTX or XLSX file.

    Args:
        file: The upload to read. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute holds the
            path. ``None`` (nothing uploaded) is reported as an error.

    Returns:
        The extracted text. Failures are reported in-band instead of being
        raised: a string starting with ``"Error reading"`` when no file was
        given or the file could not be parsed, or exactly
        ``"Unsupported file format"`` when the extension is not one of the
        four handled types.
    """
    if file is None:
        return "Error reading file: no file was provided"

    # Accept both a bare path and a file-like wrapper that exposes ``.name``.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name
    ext = path.rsplit(".", 1)[-1].lower()

    # Each branch deliberately catches Exception: callers rely on getting an
    # error *string* back rather than an exception from the parsing library.
    if ext == "pdf":
        try:
            with fitz.open(path) as doc:
                return "\n".join(page.get_text("text") for page in doc)
        except Exception as e:
            return f"Error reading PDF: {e}"

    if ext == "docx":
        try:
            doc = docx.Document(path)
            return "\n".join(p.text for p in doc.paragraphs)
        except Exception as e:
            return f"Error reading DOCX: {e}"

    if ext == "pptx":
        try:
            prs = pptx.Presentation(path)
            lines = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Not every shape carries text (pictures, connectors...).
                    if hasattr(shape, "text"):
                        lines.append(shape.text + "\n")
            return "".join(lines)
        except Exception as e:
            return f"Error reading PPTX: {e}"

    if ext == "xlsx":
        try:
            wb = openpyxl.load_workbook(path)
            lines = []
            for sheet in wb.sheetnames:
                for row in wb[sheet].iter_rows(values_only=True):
                    # Skip only genuinely empty cells; a plain truthiness
                    # test would also drop legitimate 0 / False values.
                    cells = [
                        str(cell)
                        for cell in row
                        if cell is not None and cell != ""
                    ]
                    lines.append(" ".join(cells) + "\n")
            return "".join(lines)
        except Exception as e:
            return f"Error reading XLSX: {e}"

    return "Unsupported file format"
def summarize_document(file):
    """Summarize an uploaded document to roughly 20% of its word count.

    Args:
        file: The uploaded file, passed unchanged to ``extract_text``.

    Returns:
        The generated summary text. If extraction fails, the message string
        produced by ``extract_text`` is returned as-is; if the document
        yields no text or the model call fails, a string starting with
        ``"Error during summarization:"`` is returned.
    """
    text = extract_text(file)

    # extract_text reports failures in-band. Match its actual messages rather
    # than searching for the words "Error"/"Unsupported" anywhere in the
    # text, which would bail out on any document that merely contains them.
    if text.startswith("Error reading") or text == "Unsupported file format":
        return text

    word_count = len(text.split())
    if word_count == 0:
        # Nothing to summarize (e.g. an empty or image-only document); do not
        # send an empty prompt to the model.
        return "Error during summarization: no text could be extracted from the document"

    # Target 20% of the source word count, but never fewer than 20.
    max_summary_len = max(20, int(word_count * 0.2))
    min_summary_len = int(max_summary_len * 0.6)
    try:
        summary = pipe(
            text,
            max_length=max_summary_len,
            min_length=min_summary_len,
            do_sample=False,
        )
        return summary[0]['generated_text']
    except Exception as e:
        return f"Error during summarization: {e}"
# Gradio UI: a single file-upload input wired to summarize_document, with the
# result shown in a text box.
demo = gr.Interface(
    title="π Document Summarizer (20% Length)",
    description="Upload a document and get a concise summary generated by your custom Hugging Face model.",
    fn=summarize_document,
    inputs=gr.File(
        label="Upload a document (PDF, DOCX, PPTX, XLSX)",
        file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
    ),
    outputs=gr.Textbox(label="20% Summary"),
)
# FastAPI setup
app = FastAPI()

# Serve the Gradio interface at the site root.
#
# No separate GET "/" redirect route is registered here: a route added after
# this mount at the same path would never be selected (routes are tried in
# registration order and the mount already claims "/"), and a handler that
# redirects "/" to "/" would loop forever if it were ever reached.
app = gr.mount_gradio_app(app, demo, path="/")
|