File size: 2,815 Bytes
0d84ecf
3a44bf2
0d84ecf
 
3a44bf2
 
 
c330600
3a44bf2
 
461e409
3a44bf2
 
c3071ac
3a44bf2
 
 
 
c3071ac
3a44bf2
 
 
 
 
 
5e30a65
3a44bf2
 
 
 
 
 
5e30a65
3a44bf2
 
 
 
 
 
 
 
 
 
 
0d84ecf
3a44bf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d84ecf
5e30a65
3a44bf2
 
5e30a65
3a44bf2
0d84ecf
 
3a44bf2
 
 
 
 
 
 
 
 
 
0d84ecf
3a44bf2
0d84ecf
c330600
3a44bf2
5b4fc38
c3071ac
3a44bf2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
from transformers import pipeline
import fitz  # PyMuPDF
import docx
import pptx
import openpyxl
import os

from fastapi import FastAPI
from fastapi.responses import RedirectResponse

# Summarization pipeline, built once when this module is imported and reused
# by every request handled by summarize_document().
pipe = pipeline(
    task="text2text-generation",
    model="FeruzaBoynazarovaas/my_awesome_billsum_model",
)

# Document text extraction function
def _read_pdf(path):
    """Return the plain text of every page of the PDF at *path*, newline-joined."""
    with fitz.open(path) as doc:
        return "\n".join(page.get_text("text") for page in doc)


def _read_docx(path):
    """Return the text of every paragraph of the DOCX at *path*, newline-joined."""
    return "\n".join(p.text for p in docx.Document(path).paragraphs)


def _read_pptx(path):
    """Return the text of every shape that exposes ``.text``, one shape per line."""
    lines = []
    for slide in pptx.Presentation(path).slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                lines.append(shape.text + "\n")
    return "".join(lines)


def _read_xlsx(path):
    """Return one line per worksheet row, with non-empty cells space-joined."""
    wb = openpyxl.load_workbook(path)
    lines = []
    for sheet in wb.sheetnames:
        for row in wb[sheet].iter_rows(values_only=True):
            # Test against None / "" instead of truthiness so that legitimate
            # falsy values (0, 0.0, False) are not silently dropped.
            cells = [str(cell) for cell in row if cell is not None and cell != ""]
            lines.append(" ".join(cells) + "\n")
    return "".join(lines)


def extract_text(file):
    """Extract the text content of an uploaded PDF, DOCX, PPTX or XLSX file.

    Args:
        file: Either an object whose ``name`` attribute is the path of the
            file on disk, or the path itself as a string.

    Returns:
        The extracted text on success. Failures are reported as a string
        rather than an exception:

        * ``"Error: no file provided"`` when *file* is ``None``;
        * ``"Unsupported file format"`` when the extension is not one of
          pdf/docx/pptx/xlsx (compared case-insensitively);
        * ``"Error reading <FORMAT>: <details>"`` when the reader fails.
    """
    if file is None:
        return "Error: no file provided"

    # Accept both an object carrying the path in ``.name`` and a bare path.
    path = str(getattr(file, "name", file))
    # splitext only looks at the final path component, so a dot in a
    # directory name (or a name with no extension) is not mistaken for one.
    ext = os.path.splitext(path)[1][1:].lower()

    readers = {
        "pdf": _read_pdf,
        "docx": _read_docx,
        "pptx": _read_pptx,
        "xlsx": _read_xlsx,
    }
    reader = readers.get(ext)
    if reader is None:
        return "Unsupported file format"

    try:
        return reader(path)
    except Exception as e:
        # Deliberately broad: any parser failure is turned into a message
        # that the caller can show to the user.
        return f"Error reading {ext.upper()}: {e}"

# Summarization logic
def summarize_document(file):
    """Summarize an uploaded document to roughly 20% of its word count.

    Args:
        file: The uploaded file, passed straight through to ``extract_text``.

    Returns:
        The generated summary, or a human-readable message when no file was
        given, the format is unsupported, the file could not be read, no text
        could be extracted, or the model call failed.
    """
    if file is None:
        return "Error: no file provided"

    text = extract_text(file)

    # extract_text reports failures as strings. Match only the exact messages
    # it produces: a plain substring test ('"Error" in text') would also fire
    # for any document that merely mentions the word "Error" or "Unsupported"
    # and would return the whole document instead of a summary.
    # NOTE(review): a document whose text literally starts with
    # "Error reading " is still misclassified; fully fixing that requires
    # extract_text to raise instead of returning message strings.
    if text == "Unsupported file format" or text.startswith("Error reading "):
        return text

    if not text.strip():
        # Nothing to summarize (e.g. no extractable text in the file).
        return "Error: no extractable text found in document"

    word_count = len(text.split())
    # Target 20% of the source length, but never fewer than 20.
    max_summary_len = max(20, int(word_count * 0.2))
    min_summary_len = int(max_summary_len * 0.6)

    try:
        summary = pipe(
            text,
            max_length=max_summary_len,
            min_length=min_summary_len,
            do_sample=False,
        )
        return summary[0]['generated_text']
    except Exception as e:
        return f"Error during summarization: {e}"

# Gradio Interface
# Single-input, single-output UI: an upload widget restricted to the four
# supported extensions, wired to summarize_document, with a text box for the
# resulting summary.
demo = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(
        label="Upload a document (PDF, DOCX, PPTX, XLSX)",
        file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
    ),
    outputs=gr.Textbox(label="20% Summary"),
    # U+1F4C4 (PAGE FACING UP). The literal had been corrupted into mojibake
    # by a wrong decode; writing it as an escape keeps the source ASCII-only
    # so it cannot be mangled by re-encoding again.
    title="\U0001F4C4 Document Summarizer (20% Length)",
    description="Upload a document and get a concise summary generated by your custom Hugging Face model.",
)

# FastAPI setup
app = FastAPI()

# Mount Gradio at "/" so the interface itself is what is served at the root.
app = gr.mount_gradio_app(app, demo, path="/")


def redirect_to_interface():
    """Return a redirect response pointing at the Gradio interface ("/").

    This used to be registered with ``@app.get("/")``. That registration has
    been dropped on purpose: the route was added after Gradio was mounted at
    the same path, so the mount shadowed it, and because it redirected "/" to
    "/" it would have looped forever had it ever been matched. The callable is
    kept so that anything importing it keeps working.
    """
    return RedirectResponse(url="/")