Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Files Community

ikraamkb committed on Apr 18

Commit

551e732

verified ·

1 Parent(s): 145f8e8

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -100

app.py CHANGED Viewed

@@ -93,143 +93,160 @@ app = gr.mount_gradio_app(app, demo, path="/")
 def redirect_to_interface():
     return RedirectResponse(url="/")"""
 import gradio as gr
-from transformers import pipeline, AutoTokenizer
 import fitz  # PyMuPDF
 import docx
 import pptx
 import openpyxl
 import re
 from nltk.tokenize import sent_tokenize
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
-from typing import Optional
-import torch
-# CPU-optimized model loading
-MODEL_NAME = "facebook/bart-large-cnn"  # Good balance of quality and size
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-# Use smaller batch sizes and disable GPU
-pipe = pipeline(
     "summarization",
-    model=MODEL_NAME,
     tokenizer=tokenizer,
     device=-1,  # Force CPU usage
-    torch_dtype=torch.float32  # Use 32-bit floats on CPU
 )
-# Text processing utilities
 def clean_text(text: str) -> str:
-    """Optimized text cleaning for CPU"""
-    text = re.sub(r'\s+', ' ', text)  # Combine whitespace
-    text = re.sub(r'•\s*|\d\.\s+', '', text)  # Remove bullets and numbers
     text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove brackets/parentheses
     return text.strip()
-def split_into_chunks(text: str, max_chunk_size: int = 768) -> list[str]:
-    """CPU-efficient text chunking"""
-    sentences = sent_tokenize(text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk.split()) + len(sentence.split()) <= max_chunk_size:
-            current_chunk += " " + sentence
-        else:
-            chunks.append(current_chunk.strip())
-            current_chunk = sentence
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-# Memory-efficient text extraction
-def extract_text(file) -> tuple[Optional[str], Optional[str]]:
-    ext = file.name.split(".")[-1].lower()
-    path = file.name
     try:
-        if ext == "pdf":
-            text = []
-            with fitz.open(path) as doc:
-                for page in doc:
-                    text.append(page.get_text("text"))
-            return clean_text("\n".join(text)), None
-        elif ext == "docx":
-            doc = docx.Document(path)
-            return clean_text("\n".join(p.text for p in doc.paragraphs)), None
-        elif ext == "pptx":
             text = []
-            prs = pptx.Presentation(path)
             for slide in prs.slides:
                 for shape in slide.shapes:
                     if hasattr(shape, "text"):
                         text.append(shape.text)
-            return clean_text("\n".join(text)), None
-        elif ext == "xlsx":
             text = []
-            wb = openpyxl.load_workbook(path, read_only=True)
             for sheet in wb.sheetnames:
                 for row in wb[sheet].iter_rows(values_only=True):
                     text.append(" ".join(str(cell) for cell in row if cell))
-            return clean_text("\n".join(text)), None
-        return None, "Unsupported file format"
     except Exception as e:
-        return None, f"Error reading {ext.upper()}: {str(e)}"
-# CPU-optimized summarization
-def summarize_document(file, summary_length: str = "medium"):
-    # CPU-friendly length parameters
     length_params = {
         "short": {"max_length": 80, "min_length": 30},
         "medium": {"max_length": 150, "min_length": 60},
         "long": {"max_length": 200, "min_length": 80}
     }
-    text, error = extract_text(file)
-    if error:
-        return error
-    if not text or len(text.split()) < 30:
-        return "Document too short to summarize meaningfully"
-    try:
-        chunks = split_into_chunks(text)
-        summaries = []
-        for chunk in chunks:
-            summary = pipe(
                 chunk,
-                max_length=length_params[summary_length]["max_length"],
-                min_length=length_params[summary_length]["min_length"],
                 do_sample=False,
                 truncation=True,
-                no_repeat_ngram_size=2,  # Reduced from 3 for CPU
-                num_beams=2,  # Reduced from 4 for CPU
                 early_stopping=True
             )
             summaries.append(summary[0]['summary_text'])
-        # Efficient summary combination
-        final_summary = " ".join(summaries)
-        final_summary = ". ".join(s.strip().capitalize()
-                                 for s in final_summary.split(". ")
-                                 if s.strip())
-        return final_summary if len(final_summary) > 25 else "Summary too short - try a longer document"
     except Exception as e:
         return f"Summarization error: {str(e)}"
-# Lightweight Gradio interface
-with gr.Blocks(title="CPU Document Summarizer", theme="soft") as demo:
-    gr.Markdown("## 📄 CPU-Optimized Document Summarizer")
     with gr.Row():
         with gr.Column():
@@ -243,24 +260,22 @@ with gr.Blocks(title="CPU Document Summarizer", theme="soft") as demo:
                 value="medium",
                 label="Summary Length"
             )
-            submit_btn = gr.Button("Summarize", variant="primary")
         with gr.Column():
-            output = gr.Textbox(label="Summary", lines=8)
             status = gr.Textbox(label="Status", interactive=False)
-    @submit_btn.click(inputs=[file_input, length_radio], outputs=[output, status])
-    def process(file, length):
-        if not file:
-            return "", "Error: No file uploaded"
-        status = "Processing... (this may take a while on CPU)"
-        summary = summarize_document(file, length)
-        return summary, "Done"
-# FastAPI setup
-app = FastAPI()
 app = gr.mount_gradio_app(app, demo, path="/")
 @app.get("/")
-def redirect():
-    return RedirectResponse(url="/")

 def redirect_to_interface():
     return RedirectResponse(url="/")"""
 import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import fitz  # PyMuPDF
 import docx
 import pptx
 import openpyxl
 import re
+import nltk
 from nltk.tokenize import sent_tokenize
+import torch
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
+import os
# Download required NLTK data.
# `sent_tokenize` needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from "punkt", while recent releases look for "punkt_tab", so both
# are ensured here; a resource that is already present is not re-downloaded.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except LookupError:
        nltk.download(_nltk_resource)

# Initialize components
app = FastAPI()

# Load summarization model (CPU optimized)
MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # Force CPU usage
    torch_dtype=torch.float32
)
 def clean_text(text: str) -> str:
+    """Clean and normalize document text"""
+    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+    text = re.sub(r'•\s*|\d\.\s+', '', text)  # Remove bullets and numbering
     text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove brackets/parentheses
+    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)  # Remove page numbers
     return text.strip()
def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
    """Extract text from various document formats.

    Args:
        file_path: Path of the document on disk.
        file_extension: Lower-case extension without the dot; one of
            "pdf", "docx", "pptx" or "xlsx".

    Returns:
        A ``(text, error)`` pair. On success ``text`` is the cleaned document
        text and ``error`` is ``""``; on failure ``text`` is ``""`` and
        ``error`` is a human-readable message. Exceptions raised while
        reading are reported through ``error`` rather than propagated.
    """
    try:
        if file_extension == "pdf":
            with fitz.open(file_path) as doc:
                return clean_text("\n".join(page.get_text("text") for page in doc)), ""
        elif file_extension == "docx":
            doc = docx.Document(file_path)
            return clean_text("\n".join(p.text for p in doc.paragraphs)), ""
        elif file_extension == "pptx":
            prs = pptx.Presentation(file_path)
            text = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            ]
            return clean_text("\n".join(text)), ""
        elif file_extension == "xlsx":
            wb = openpyxl.load_workbook(file_path, read_only=True)
            try:
                text = []
                for sheet in wb.sheetnames:
                    for row in wb[sheet].iter_rows(values_only=True):
                        # Skip only truly empty cells: a plain truthiness
                        # test would also discard legitimate 0 / False values.
                        text.append(" ".join(str(cell) for cell in row if cell is not None))
            finally:
                # Read-only workbooks keep the file open until closed explicitly.
                wb.close()
            return clean_text("\n".join(text)), ""
        return "", "Unsupported file format"
    except Exception as e:
        return "", f"Error reading {file_extension.upper()} file: {str(e)}"
def chunk_text(text: str, max_tokens: int = 768) -> list[str]:
    """Split text into manageable chunks for summarization.

    Sentences are packed greedily into chunks. A chunk is closed as soon as
    adding the next sentence would push it past ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Upper bound per chunk, measured in whitespace-separated
            words (not model tokens). A single sentence longer than this
            bound is kept whole in a chunk of its own.

    Returns:
        The non-empty chunks in document order; ``[]`` for empty input.
    """
    try:
        sentences = sent_tokenize(text)
    except Exception:
        # Best-effort fallback if sentence tokenization fails (for example
        # when the tokenizer data is unavailable): 20-word pseudo-sentences.
        words = text.split()
        sentences = [' '.join(words[i:i + 20]) for i in range(0, len(words), 20)]

    chunks: list[str] = []
    pending: list[str] = []
    pending_words = 0
    for sentence in sentences:
        sentence_words = len(sentence.split())
        if sentence_words == 0:
            continue
        # Flush only when something is pending, so an over-long first
        # sentence never produces an empty leading chunk.
        if pending and pending_words + sentence_words > max_tokens:
            chunks.append(" ".join(pending).strip())
            pending = []
            pending_words = 0
        pending.append(sentence)
        pending_words += sentence_words
    if pending:
        chunks.append(" ".join(pending).strip())
    return chunks
def generate_summary(text: str, length: str = "medium") -> str:
    """Generate summary with appropriate length parameters.

    The text is split with ``chunk_text``, each chunk is summarized
    independently, and the partial summaries are joined into one string.

    Args:
        text: Cleaned document text.
        length: "short", "medium" or "long". Any other value falls back to
            the "medium" settings.

    Returns:
        The combined summary, or a fixed notice when the result is 25
        characters or shorter. A chunk whose summarization raises is
        represented in the output by a "[Chunk error: ...]" marker.
    """
    length_params = {
        "short": {"max_length": 80, "min_length": 30},
        "medium": {"max_length": 150, "min_length": 60},
        "long": {"max_length": 200, "min_length": 80}
    }
    # Resolve the settings once, outside the loop; an unknown key would
    # otherwise raise inside the per-chunk try and turn every chunk into an
    # error marker.
    params = length_params.get(length, length_params["medium"])

    summaries = []
    for chunk in chunk_text(text):
        try:
            summary = summarizer(
                chunk,
                max_length=params["max_length"],
                min_length=params["min_length"],
                do_sample=False,
                truncation=True,
                no_repeat_ngram_size=2,
                num_beams=2,
                early_stopping=True
            )
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            summaries.append(f"[Chunk error: {str(e)}]")

    # Combine and format the final summary. Only the first character of each
    # sentence is upper-cased: str.capitalize() would also lowercase the rest
    # of the sentence, mangling names and acronyms.
    sentences = (s.strip() for s in " ".join(summaries).split(". "))
    final_summary = ". ".join(s[0].upper() + s[1:] for s in sentences if s)
    return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
def summarize_document(file, summary_length: str):
    """Main processing function for Gradio interface.

    Args:
        file: The uploaded file as passed by the file input, or ``None``
            when nothing was uploaded. Either an object exposing the path as
            ``.name`` or a plain path string is accepted.
        summary_length: "short", "medium" or "long".

    Returns:
        A ``(summary, status)`` pair, one value for each of the two output
        components the click handler is wired to. On failure ``summary`` is
        ``""`` and ``status`` carries the message.
    """
    if file is None:
        return "", "Please upload a document first"

    file_path = getattr(file, "name", file)
    file_extension = file_path.split(".")[-1].lower()

    text, error = extract_text(file_path, file_extension)
    if error:
        return "", error
    if not text or len(text.split()) < 30:
        return "", "Document is too short or contains too little text to summarize"

    try:
        return generate_summary(text, summary_length), "Done"
    except Exception as e:
        return "", f"Summarization error: {str(e)}"
+# Gradio Interface
+with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📄 Document Summarizer")
+    gr.Markdown("Upload a document to generate a concise summary")
     with gr.Row():
         with gr.Column():
                 value="medium",
                 label="Summary Length"
             )
+            submit_btn = gr.Button("Generate Summary", variant="primary")
         with gr.Column():
+            output = gr.Textbox(label="Summary", lines=10)
             status = gr.Textbox(label="Status", interactive=False)
+    submit_btn.click(
+        fn=summarize_document,
+        inputs=[file_input, length_radio],
+        outputs=[output, status],
+        api_name="summarize"
+    )
+# Mount Gradio app to FastAPI
 app = gr.mount_gradio_app(app, demo, path="/")
 @app.get("/")
+def redirect_to_interface():
+    return RedirectResponse(url="/")