Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
import fitz # PyMuPDF
|
4 |
import docx
|
@@ -91,4 +91,177 @@ app = gr.mount_gradio_app(app, demo, path="/")
|
|
91 |
# Optional root redirect
|
92 |
@app.get("/")
|
93 |
def redirect_to_interface():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
return RedirectResponse(url="/")
|
|
|
|
|
|
1 |
+
"""import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
import fitz # PyMuPDF
|
4 |
import docx
|
|
|
91 |
# Optional root redirect
|
92 |
@app.get("/")
|
93 |
def redirect_to_interface():
|
94 |
+
return RedirectResponse(url="/")"""
|
95 |
+
import gradio as gr
|
96 |
+
from transformers import pipeline, AutoTokenizer
|
97 |
+
import fitz # PyMuPDF
|
98 |
+
import docx
|
99 |
+
import pptx
|
100 |
+
import openpyxl
|
101 |
+
import re
|
102 |
+
from nltk.tokenize import sent_tokenize
|
103 |
+
from fastapi import FastAPI
|
104 |
+
from fastapi.responses import RedirectResponse
|
105 |
+
from typing import Optional
|
106 |
+
import torch
|
107 |
+
|
108 |
+
# CPU-optimized model loading.
# bart-large-cnn is loaded once at module import; the tokenizer is created
# explicitly so the pipeline reuses it instead of loading a second copy.
MODEL_NAME = "facebook/bart-large-cnn"  # Good balance of quality and size
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use smaller batch sizes and disable GPU
pipe = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1,  # Force CPU usage
    torch_dtype=torch.float32  # Use 32-bit floats on CPU
)
|
120 |
+
|
121 |
+
# Text processing utilities
|
122 |
+
def clean_text(text: str) -> str:
|
123 |
+
"""Optimized text cleaning for CPU"""
|
124 |
+
text = re.sub(r'\s+', ' ', text) # Combine whitespace
|
125 |
+
text = re.sub(r'•\s*|\d\.\s+', '', text) # Remove bullets and numbers
|
126 |
+
text = re.sub(r'\[.*?\]|\(.*?\)', '', text) # Remove brackets/parentheses
|
127 |
+
return text.strip()
|
128 |
+
|
129 |
+
def split_into_chunks(text: str, max_chunk_size: int = 768) -> list[str]:
|
130 |
+
"""CPU-efficient text chunking"""
|
131 |
+
sentences = sent_tokenize(text)
|
132 |
+
chunks = []
|
133 |
+
current_chunk = ""
|
134 |
+
|
135 |
+
for sentence in sentences:
|
136 |
+
if len(current_chunk.split()) + len(sentence.split()) <= max_chunk_size:
|
137 |
+
current_chunk += " " + sentence
|
138 |
+
else:
|
139 |
+
chunks.append(current_chunk.strip())
|
140 |
+
current_chunk = sentence
|
141 |
+
|
142 |
+
if current_chunk:
|
143 |
+
chunks.append(current_chunk.strip())
|
144 |
+
|
145 |
+
return chunks
|
146 |
+
|
147 |
+
# Memory-efficient text extraction
|
148 |
+
def extract_text(file) -> tuple[Optional[str], Optional[str]]:
|
149 |
+
ext = file.name.split(".")[-1].lower()
|
150 |
+
path = file.name
|
151 |
+
|
152 |
+
try:
|
153 |
+
if ext == "pdf":
|
154 |
+
text = []
|
155 |
+
with fitz.open(path) as doc:
|
156 |
+
for page in doc:
|
157 |
+
text.append(page.get_text("text"))
|
158 |
+
return clean_text("\n".join(text)), None
|
159 |
+
|
160 |
+
elif ext == "docx":
|
161 |
+
doc = docx.Document(path)
|
162 |
+
return clean_text("\n".join(p.text for p in doc.paragraphs)), None
|
163 |
+
|
164 |
+
elif ext == "pptx":
|
165 |
+
text = []
|
166 |
+
prs = pptx.Presentation(path)
|
167 |
+
for slide in prs.slides:
|
168 |
+
for shape in slide.shapes:
|
169 |
+
if hasattr(shape, "text"):
|
170 |
+
text.append(shape.text)
|
171 |
+
return clean_text("\n".join(text)), None
|
172 |
+
|
173 |
+
elif ext == "xlsx":
|
174 |
+
text = []
|
175 |
+
wb = openpyxl.load_workbook(path, read_only=True)
|
176 |
+
for sheet in wb.sheetnames:
|
177 |
+
for row in wb[sheet].iter_rows(values_only=True):
|
178 |
+
text.append(" ".join(str(cell) for cell in row if cell))
|
179 |
+
return clean_text("\n".join(text)), None
|
180 |
+
|
181 |
+
return None, "Unsupported file format"
|
182 |
+
|
183 |
+
except Exception as e:
|
184 |
+
return None, f"Error reading {ext.upper()}: {str(e)}"
|
185 |
+
|
186 |
+
# CPU-optimized summarization
|
187 |
+
def summarize_document(file, summary_length: str = "medium"):
|
188 |
+
# CPU-friendly length parameters
|
189 |
+
length_params = {
|
190 |
+
"short": {"max_length": 80, "min_length": 30},
|
191 |
+
"medium": {"max_length": 150, "min_length": 60},
|
192 |
+
"long": {"max_length": 200, "min_length": 80}
|
193 |
+
}
|
194 |
+
|
195 |
+
text, error = extract_text(file)
|
196 |
+
if error:
|
197 |
+
return error
|
198 |
+
|
199 |
+
if not text or len(text.split()) < 30:
|
200 |
+
return "Document too short to summarize meaningfully"
|
201 |
+
|
202 |
+
try:
|
203 |
+
chunks = split_into_chunks(text)
|
204 |
+
summaries = []
|
205 |
+
|
206 |
+
for chunk in chunks:
|
207 |
+
summary = pipe(
|
208 |
+
chunk,
|
209 |
+
max_length=length_params[summary_length]["max_length"],
|
210 |
+
min_length=length_params[summary_length]["min_length"],
|
211 |
+
do_sample=False,
|
212 |
+
truncation=True,
|
213 |
+
no_repeat_ngram_size=2, # Reduced from 3 for CPU
|
214 |
+
num_beams=2, # Reduced from 4 for CPU
|
215 |
+
early_stopping=True
|
216 |
+
)
|
217 |
+
summaries.append(summary[0]['summary_text'])
|
218 |
+
|
219 |
+
# Efficient summary combination
|
220 |
+
final_summary = " ".join(summaries)
|
221 |
+
final_summary = ". ".join(s.strip().capitalize()
|
222 |
+
for s in final_summary.split(". ")
|
223 |
+
if s.strip())
|
224 |
+
|
225 |
+
return final_summary if len(final_summary) > 25 else "Summary too short - try a longer document"
|
226 |
+
|
227 |
+
except Exception as e:
|
228 |
+
return f"Summarization error: {str(e)}"
|
229 |
+
|
230 |
+
# Lightweight Gradio interface: upload + length choice on the left,
# summary and status textboxes on the right.
with gr.Blocks(title="CPU Document Summarizer", theme="soft") as demo:
    gr.Markdown("## 📄 CPU-Optimized Document Summarizer")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
                type="filepath"
            )
            length_radio = gr.Radio(
                ["short", "medium", "long"],
                value="medium",
                label="Summary Length"
            )
            submit_btn = gr.Button("Summarize", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Summary", lines=8)
            status = gr.Textbox(label="Status", interactive=False)

    # Click handler: returns (summary_text, status_text).
    @submit_btn.click(inputs=[file_input, length_radio], outputs=[output, status])
    def process(file, length):
        # Guard clause: Gradio passes None when no file was chosen.
        if not file:
            return "", "Error: No file uploaded"
        # NOTE(review): this local string is never displayed — outputs are
        # updated only once, on return, so the "Processing..." message is
        # dead code (it also shadows the `status` Textbox above).
        status = "Processing... (this may take a while on CPU)"
        summary = summarize_document(file, length)
        return summary, "Done"
|
259 |
+
|
260 |
+
# FastAPI setup
app = FastAPI()

@app.get("/")
def redirect():
    # NOTE(review): this redirects "/" back to "/" — an infinite redirect
    # loop if it were ever served. In practice mount_gradio_app below
    # re-mounts "/" with the Gradio app, shadowing this route entirely;
    # consider removing it or pointing it at a distinct path.
    return RedirectResponse(url="/")

# Serve the Gradio UI at the site root (this overrides the route above).
app = gr.mount_gradio_app(app, demo, path="/")
|