Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Files Community

ikraamkb committed on Apr 18

Commit

9b32604

verified ·

1 Parent(s): 40fa0d9

add the readers

Browse files

Files changed (1) hide show

app.py +68 -14

app.py CHANGED Viewed

@@ -103,7 +103,11 @@ import nltk
 from nltk.tokenize import sent_tokenize
 import torch
 from fastapi import FastAPI
-from fastapi.responses import RedirectResponse
 # Download required NLTK data
 nltk.download('punkt', quiet=True)
@@ -111,7 +115,7 @@ nltk.download('punkt', quiet=True)
 # Initialize components
 app = FastAPI()
-# Load summarization model (CPU optimized)
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
@@ -123,6 +127,9 @@ summarizer = pipeline(
     torch_dtype=torch.float32
 )
 def clean_text(text: str) -> str:
     """Clean and normalize document text"""
     text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
@@ -136,7 +143,16 @@ def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
     try:
         if file_extension == "pdf":
             with fitz.open(file_path) as doc:
-                return clean_text("\n".join(page.get_text("text") for page in doc)), ""
         elif file_extension == "docx":
             doc = docx.Document(file_path)
@@ -159,6 +175,10 @@ def extract_text(file_path: str, file_extension: str) -> tuple[str, str]:
                     text.append(" ".join(str(cell) for cell in row if cell))
             return clean_text("\n".join(text)), ""
         return "", "Unsupported file format"
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
@@ -219,37 +239,49 @@ def generate_summary(text: str, length: str = "medium") -> str:
     final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
     return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
def summarize_document(file, summary_length: str):
    """Main processing function for Gradio interface.

    Args:
        file: The uploaded document. Either a filesystem path (a ``str`` or
            path-like object, which is what a ``type="filepath"`` file input
            supplies) or an object exposing the path as ``.name``. ``None``
            when nothing was uploaded.
        summary_length: Length preset forwarded to ``generate_summary``.

    Returns:
        A ``(summary_or_message, status)`` tuple for the two output textboxes.
    """
    import os  # function-scope import keeps this block self-contained

    if file is None:
        return "Please upload a document first", "Ready"

    # Accept both a bare path and a file-like wrapper; calling ``.name`` on a
    # plain string would raise AttributeError.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext only looks at the final path component, so a dot in a parent
    # directory name is never mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lstrip(".").lower()

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "Error"
    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "Ready"

    try:
        summary = generate_summary(text, summary_length)
        return summary, "Summary complete"
    except Exception as e:
        # UI boundary: surface the failure in the textbox instead of crashing.
        return f"Summarization error: {str(e)}", "Error"
 # Gradio Interface
 with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📄 Document Summarizer")
-    gr.Markdown("Upload a document to generate a concise summary")
     with gr.Row():
         with gr.Column():
             file_input = gr.File(
                 label="Upload Document",
-                file_types=[".pdf", ".docx", ".pptx", ".xlsx"],
                 type="filepath"
             )
             length_radio = gr.Radio(
@@ -257,19 +289,41 @@ with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
                 value="medium",
                 label="Summary Length"
             )
             submit_btn = gr.Button("Generate Summary", variant="primary")
         with gr.Column():
             output = gr.Textbox(label="Summary", lines=10)
             status = gr.Textbox(label="Status", interactive=False)
     submit_btn.click(
         fn=summarize_document,
-        inputs=[file_input, length_radio],
-        outputs=[output, status],
         api_name="summarize"
     )
 # Mount Gradio app to FastAPI
 app = gr.mount_gradio_app(app, demo, path="/")

 from nltk.tokenize import sent_tokenize
 import torch
 from fastapi import FastAPI
+from fastapi.responses import RedirectResponse, FileResponse
+from gtts import gTTS
+import tempfile
+import os
+import easyocr
 # Download required NLTK data
 nltk.download('punkt', quiet=True)
 # Initialize components
 app = FastAPI()
+# Load models (CPU optimized)
 MODEL_NAME = "facebook/bart-large-cnn"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
     torch_dtype=torch.float32
 )
+# Initialize EasyOCR reader
+reader = easyocr.Reader(['en'])  # English only for faster initialization
 def clean_text(text: str) -> str:
     """Clean and normalize document text"""
     text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
     try:
         if file_extension == "pdf":
             with fitz.open(file_path) as doc:
+                text = "\n".join(page.get_text("text") for page in doc)
+                # Try OCR for scanned PDFs if text extraction fails
+                if len(text.strip()) < 50:
+                    images = [page.get_pixmap() for page in doc]
+                    temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+                    images[0].save(temp_img.name)
+                    ocr_result = reader.readtext(temp_img.name, detail=0)
+                    os.unlink(temp_img.name)
+                    text = "\n".join(ocr_result) if ocr_result else text
+                return clean_text(text), ""
         elif file_extension == "docx":
             doc = docx.Document(file_path)
                     text.append(" ".join(str(cell) for cell in row if cell))
             return clean_text("\n".join(text)), ""
+        elif file_extension in ["jpg", "jpeg", "png"]:
+            ocr_result = reader.readtext(file_path, detail=0)
+            return clean_text("\n".join(ocr_result)), ""
         return "", "Unsupported file format"
     except Exception as e:
         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
     final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
     return final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"
def text_to_speech(text: str) -> str:
    """Convert text to speech and return the path of a temporary MP3 file.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the generated ``.mp3`` file, or ``""`` when ``text`` is blank
        or synthesis fails. The file is created with ``delete=False``, so it
        is not removed automatically; the caller owns it.
    """
    # Nothing to speak: skip creating a file that would only be thrown away.
    if not text or not text.strip():
        return ""

    audio_path = ""
    try:
        tts = gTTS(text)
        # Reserve a unique file name, then close the handle straight away so
        # no descriptor is leaked and the file can be reopened by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print(f"Error in text-to-speech: {e}")
        # Do not leave an empty or partial file behind after a failed save.
        if audio_path and os.path.exists(audio_path):
            try:
                os.unlink(audio_path)
            except OSError:
                pass
        return ""
def summarize_document(file, summary_length: str, enable_tts: bool):
    """Main processing function for Gradio interface.

    Args:
        file: The uploaded document. Either a filesystem path (a ``str`` or
            path-like object, which is what a ``type="filepath"`` file input
            supplies) or an object exposing the path as ``.name``. ``None``
            when nothing was uploaded.
        summary_length: Length preset forwarded to ``generate_summary``.
        enable_tts: When true, the summary is also passed to
            ``text_to_speech``.

    Returns:
        A ``(summary_or_message, status, audio_path)`` tuple. ``audio_path``
        is ``None`` unless text-to-speech was requested and produced a file.
    """
    if file is None:
        return "Please upload a document first", "Ready", None

    # Accept both a bare path and a file-like wrapper; calling ``.name`` on a
    # plain string would raise AttributeError.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext only looks at the final path component, so a dot in a parent
    # directory name is never mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lstrip(".").lower()

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "Error", None
    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "Ready", None

    try:
        summary = generate_summary(text, summary_length)
        # text_to_speech signals failure with "", which is not a usable file
        # path for the audio output, so normalise it to None.
        audio_path = (text_to_speech(summary) or None) if enable_tts else None
        return summary, "Summary complete", audio_path
    except Exception as e:
        # UI boundary: surface the failure in the textbox instead of crashing.
        return f"Summarization error: {str(e)}", "Error", None
 # Gradio Interface
 with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📄 Advanced Document Summarizer")
+    gr.Markdown("Upload a document to generate a summary with optional audio reading")
     with gr.Row():
         with gr.Column():
             file_input = gr.File(
                 label="Upload Document",
+                file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
                 type="filepath"
             )
             length_radio = gr.Radio(
                 value="medium",
                 label="Summary Length"
             )
+            tts_checkbox = gr.Checkbox(
+                label="Enable Text-to-Speech",
+                value=False
+            )
             submit_btn = gr.Button("Generate Summary", variant="primary")
         with gr.Column():
             output = gr.Textbox(label="Summary", lines=10)
             status = gr.Textbox(label="Status", interactive=False)
+            audio_output = gr.Audio(label="Audio Summary", visible=False)
+    def toggle_audio_visibility(enable_tts):
+        return gr.Audio(visible=enable_tts)
+    tts_checkbox.change(
+        fn=toggle_audio_visibility,
+        inputs=tts_checkbox,
+        outputs=audio_output
+    )
     submit_btn.click(
         fn=summarize_document,
+        inputs=[file_input, length_radio, tts_checkbox],
+        outputs=[output, status, audio_output],
         api_name="summarize"
     )
+# FastAPI endpoints for audio files
@app.get("/audio/{file_name}")
async def get_audio(file_name: str):
    """Serve a generated MP3 file from the system temporary directory.

    Args:
        file_name: Bare file name (no directory components) of an ``.mp3``
            file located directly inside ``tempfile.gettempdir()``.

    Returns:
        A ``FileResponse`` with media type ``audio/mpeg`` when the file
        exists, otherwise a JSON ``{"error": "File not found"}`` body with
        status code 404.
    """
    # JSONResponse is not among the module-level fastapi.responses imports,
    # so bring it into scope here; without it the 404 path raises NameError.
    from fastapi.responses import JSONResponse

    not_found = JSONResponse({"error": "File not found"}, status_code=404)

    # file_name comes straight from the URL. Only a plain ".mp3" name is
    # accepted, so the endpoint cannot be used to read other temp files or
    # to step outside the temp directory.
    is_plain_name = os.path.basename(file_name) == file_name
    if not is_plain_name or not file_name.lower().endswith(".mp3"):
        return not_found

    file_path = os.path.join(tempfile.gettempdir(), file_name)
    if os.path.isfile(file_path):
        return FileResponse(file_path, media_type="audio/mpeg")
    return not_found
 # Mount Gradio app to FastAPI
 app = gr.mount_gradio_app(app, demo, path="/")