Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Files Community

ikraamkb committed on Apr 18

Commit

315a442

verified ·

1 Parent(s): 9b32604

add download pdf

Browse files

Files changed (1) hide show

app.py +45 -105

app.py CHANGED Viewed

@@ -1,97 +1,3 @@
-"""import gradio as gr
-from transformers import pipeline
-import fitz  # PyMuPDF
-import docx
-import pptx
-import openpyxl
-import os
-from fastapi import FastAPI
-from fastapi.responses import RedirectResponse
-# Load your custom summarization model
-pipe = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
-# Document text extraction function
-def extract_text(file):
-    ext = file.name.split(".")[-1].lower()
-    path = file.name
-    if ext == "pdf":
-        try:
-            with fitz.open(path) as doc:
-                return "\n".join([page.get_text("text") for page in doc])
-        except Exception as e:
-            return f"Error reading PDF: {e}"
-    elif ext == "docx":
-        try:
-            doc = docx.Document(path)
-            return "\n".join([p.text for p in doc.paragraphs])
-        except Exception as e:
-            return f"Error reading DOCX: {e}"
-    elif ext == "pptx":
-        try:
-            prs = pptx.Presentation(path)
-            text = ""
-            for slide in prs.slides:
-                for shape in slide.shapes:
-                    if hasattr(shape, "text"):
-                        text += shape.text + "\n"
-            return text
-        except Exception as e:
-            return f"Error reading PPTX: {e}"
-    elif ext == "xlsx":
-        try:
-            wb = openpyxl.load_workbook(path)
-            text = ""
-            for sheet in wb.sheetnames:
-                for row in wb[sheet].iter_rows(values_only=True):
-                    text += " ".join([str(cell) for cell in row if cell]) + "\n"
-            return text
-        except Exception as e:
-            return f"Error reading XLSX: {e}"
-    else:
-        return "Unsupported file format"
-# Summarization logic
-def summarize_document(file):
-    text = extract_text(file)
-    if "Error" in text or "Unsupported" in text:
-        return text
-    word_count = len(text.split())
-    max_summary_len = max(20, int(word_count * 0.2))
-    try:
-        summary = pipe(text, max_length=max_summary_len, min_length=int(max_summary_len * 0.6), do_sample=False)
-        # Print the summary to debug its structure
-        print(summary)
-        return summary[0]['summary_text']  # Access the correct key for the output
-    except Exception as e:
-        return f"Error during summarization: {e}"
-# Gradio Interface
-demo = gr.Interface(
-    fn=summarize_document,
-    inputs=gr.File(label="Upload a document (PDF, DOCX, PPTX, XLSX)", file_types=[".pdf", ".docx", ".pptx", ".xlsx"]),
-    outputs=gr.Textbox(label="20% Summary"),
-    title="📄 Document Summarizer (20% Length)",
-    description="Upload a document and get a concise summary generated by your custom Hugging Face model."
-)
-# FastAPI setup
-app = FastAPI()
-# Mount Gradio at "/"
-app = gr.mount_gradio_app(app, demo, path="/")
-# Optional root redirect
-@app.get("/")
-def redirect_to_interface():
-    return RedirectResponse(url="/")"""
 import gradio as gr
 from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import fitz  # PyMuPDF
@@ -108,6 +14,8 @@ from gtts import gTTS
 import tempfile
 import os
 import easyocr
 # Download required NLTK data
 nltk.download('punkt', quiet=True)
@@ -250,32 +158,63 @@ def text_to_speech(text: str) -> str:
         print(f"Error in text-to-speech: {e}")
         return ""
 def summarize_document(file, summary_length: str, enable_tts: bool):
     """Main processing function for Gradio interface"""
     if file is None:
-        return "Please upload a document first", "Ready", None
     file_path = file.name
     file_extension = file_path.split(".")[-1].lower()
     text, error = extract_text(file_path, file_extension)
     if error:
-        return error, "Error", None
     if not text or len(text.split()) < 30:
-        return "Document is too short or contains too little text to summarize", "Ready", None
     try:
         summary = generate_summary(text, summary_length)
         audio_path = text_to_speech(summary) if enable_tts else None
-        return summary, "Summary complete", audio_path
     except Exception as e:
-        return f"Summarization error: {str(e)}", "Error", None
 # Gradio Interface
 with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📄 Advanced Document Summarizer")
-    gr.Markdown("Upload a document to generate a summary with optional audio reading")
     with gr.Row():
         with gr.Column():
@@ -299,6 +238,7 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
             output = gr.Textbox(label="Summary", lines=10)
             status = gr.Textbox(label="Status", interactive=False)
             audio_output = gr.Audio(label="Audio Summary", visible=False)
     def toggle_audio_visibility(enable_tts):
         return gr.Audio(visible=enable_tts)
@@ -312,16 +252,16 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
     submit_btn.click(
         fn=summarize_document,
         inputs=[file_input, length_radio, tts_checkbox],
-        outputs=[output, status, audio_output],
         api_name="summarize"
     )
-# FastAPI endpoints for audio files
-@app.get("/audio/{file_name}")
-async def get_audio(file_name: str):
     file_path = os.path.join(tempfile.gettempdir(), file_name)
     if os.path.exists(file_path):
-        return FileResponse(file_path, media_type="audio/mpeg")
     return JSONResponse({"error": "File not found"}, status_code=404)
 # Mount Gradio app to FastAPI

 import gradio as gr
 from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import fitz  # PyMuPDF
 import tempfile
 import os
 import easyocr
+from fpdf import FPDF
+import datetime
 # Download required NLTK data
 nltk.download('punkt', quiet=True)
         print(f"Error in text-to-speech: {e}")
         return ""
+def create_pdf(summary: str, original_filename: str) -> str:
+    """Create a PDF file from the summary text"""
+    try:
+        # Create PDF object
+        pdf = FPDF()
+        pdf.add_page()
+        pdf.set_font("Arial", size=12)
+        # Add title
+        pdf.set_font("Arial", 'B', 16)
+        pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
+        pdf.set_font("Arial", size=12)
+        # Add metadata
+        pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
+        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
+        pdf.ln(10)
+        # Add summary content
+        pdf.multi_cell(0, 10, txt=summary)
+        # Save to temporary file
+        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+        pdf.output(temp_pdf.name)
+        return temp_pdf.name
+    except Exception as e:
+        print(f"Error creating PDF: {e}")
+        return ""
 def summarize_document(file, summary_length: str, enable_tts: bool):
     """Main processing function for Gradio interface"""
     if file is None:
+        return "Please upload a document first", "Ready", None, None
     file_path = file.name
     file_extension = file_path.split(".")[-1].lower()
+    original_filename = os.path.basename(file_path)
     text, error = extract_text(file_path, file_extension)
     if error:
+        return error, "Error", None, None
     if not text or len(text.split()) < 30:
+        return "Document is too short or contains too little text to summarize", "Ready", None, None
     try:
         summary = generate_summary(text, summary_length)
         audio_path = text_to_speech(summary) if enable_tts else None
+        pdf_path = create_pdf(summary, original_filename)
+        return summary, "Summary complete", audio_path, pdf_path
     except Exception as e:
+        return f"Summarization error: {str(e)}", "Error", None, None
 # Gradio Interface
 with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📄 Advanced Document Summarizer")
+    gr.Markdown("Upload a document to generate a summary with optional audio reading and PDF download")
     with gr.Row():
         with gr.Column():
             output = gr.Textbox(label="Summary", lines=10)
             status = gr.Textbox(label="Status", interactive=False)
             audio_output = gr.Audio(label="Audio Summary", visible=False)
+            pdf_download = gr.File(label="Download Summary as PDF", visible=False)
     def toggle_audio_visibility(enable_tts):
         return gr.Audio(visible=enable_tts)
     submit_btn.click(
         fn=summarize_document,
         inputs=[file_input, length_radio, tts_checkbox],
+        outputs=[output, status, audio_output, pdf_download],
         api_name="summarize"
     )
+# FastAPI endpoints for files
+@app.get("/files/{file_name}")
+async def get_file(file_name: str):
     file_path = os.path.join(tempfile.gettempdir(), file_name)
     if os.path.exists(file_path):
+        return FileResponse(file_path)
     return JSONResponse({"error": "File not found"}, status_code=404)
 # Mount Gradio app to FastAPI