Spaces:

ikraamkb
/

Summarization

Running

App Files Files Community

ikraamkb committed 7 days ago

Commit

3a44bf2

verified ·

1 Parent(s): c3071ac

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -95

app.py CHANGED Viewed

@@ -1,115 +1,92 @@
-from fastapi import FastAPI, UploadFile, File
-from fastapi.responses import RedirectResponse
 import gradio as gr
-from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
-import tempfile
-import os
-from PIL import Image
 import fitz  # PyMuPDF
 import docx
-import easyocr
-app = FastAPI()
-# Lightweight model choices
-SUMMARIZATION_MODEL = "facebook/bart-large-cnn"  # 500MB
-IMAGE_CAPTIONING_MODEL = "Salesforce/blip-image-captioning-base"  # 300MB
-# Initialize models
-try:
-    summarizer = pipeline(
-        "summarization",
-        model=SUMMARIZATION_MODEL,
-        device="cpu"
-    )
-except Exception as e:
-    print(f"Error loading summarizer: {e}")
-    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")  # Fallback 250MB model
-captioner = pipeline(
-    "image-to-text",
-    model=IMAGE_CAPTIONING_MODEL,
-    device="cpu"
-)
-reader = easyocr.Reader(['en'])  # Lightweight OCR
-def extract_text_from_file(file_path: str, file_type: str):
-    """Extract text from different document formats"""
-    try:
-        if file_type == "pdf":
-            with fitz.open(file_path) as doc:
-                return "\n".join(page.get_text() for page in doc)
-        elif file_type == "docx":
-            doc = docx.Document(file_path)
-            return "\n".join(p.text for p in doc.paragraphs)
-        else:
-            return "Unsupported file format (only PDF/DOCX supported in lightweight version)"
-    except Exception as e:
-        return f"Error reading file: {str(e)}"
-def process_document(file):
-    """Handle document summarization"""
-    try:
-        file_ext = os.path.splitext(file.name)[1][1:].lower()
-        if file_ext not in ["pdf", "docx"]:
-            return "Lightweight version only supports PDF and DOCX"
-        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
-            tmp.write(file.read())
-            tmp_path = tmp.name
-        text = extract_text_from_file(tmp_path, file_ext)
-        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
-        os.unlink(tmp_path)
-        return summary
-    except Exception as e:
-        return f"Processing error: {str(e)}"
-def process_image(image):
-    """Handle image captioning and OCR"""
     try:
-        img = Image.open(image)
-        # Get caption
-        caption = captioner(img)[0]['generated_text']
-        # Get OCR text
-        ocr_result = reader.readtext(img)
-        ocr_text = " ".join([res[1] for res in ocr_result])
-        return {
-            "caption": caption,
-            "ocr_text": ocr_text if ocr_text else "No readable text found"
-        }
     except Exception as e:
-        return {"error": str(e)}
 # Gradio Interface
-with gr.Blocks(title="Lightweight Document & Image Analysis") as demo:
-    gr.Markdown("## 📄 Lightweight Document & Image Analysis")
-    with gr.Tab("Document Summarization"):
-        gr.Markdown("Supports PDF and DOCX files (max 10MB)")
-        doc_input = gr.File(label="Upload Document", file_types=[".pdf", ".docx"])
-        doc_output = gr.Textbox(label="Summary")
-        doc_button = gr.Button("Summarize")
-    with gr.Tab("Image Analysis"):
-        gr.Markdown("Get captions and extracted text from images")
-        img_input = gr.Image(type="filepath", label="Upload Image")
-        with gr.Accordion("Results", open=False):
-            caption_output = gr.Textbox(label="Image Caption")
-            ocr_output = gr.Textbox(label="Extracted Text")
-        img_button = gr.Button("Analyze")
-    doc_button.click(process_document, inputs=doc_input, outputs=doc_output)
-    img_button.click(process_image, inputs=img_input, outputs=[caption_output, ocr_output])
-# Mount Gradio app
 app = gr.mount_gradio_app(app, demo, path="/")
 @app.get("/")
 def redirect_to_interface():
-    return RedirectResponse(url="/")

 import gradio as gr
+from transformers import pipeline
 import fitz  # PyMuPDF
 import docx
+import pptx
+import openpyxl
+import os
+from fastapi import FastAPI
+from fastapi.responses import RedirectResponse
+# Load the custom summarization model once, at import time, as a "text2text-generation" pipeline; summarize_document calls it through the module-level name `pipe`
+pipe = pipeline("text2text-generation", model="FeruzaBoynazarovaas/my_awesome_billsum_model")
+# Document text extraction function
+def extract_text(file):
+    """Extract plain text from an uploaded PDF, DOCX, PPTX or XLSX file.
+
+    Args:
+        file: Object whose ``name`` attribute is the path of the uploaded
+            file; the extension of that path selects the parser.
+
+    Returns:
+        The extracted text. If parsing fails, a message of the form
+        ``"Error reading <FORMAT>: <exception>"``; for any other extension,
+        ``"Unsupported file format"``. Failures are reported as strings
+        rather than raised, so these messages are kept stable.
+    """
+    path = file.name
+    ext = path.split(".")[-1].lower()
+    if ext == "pdf":
+        try:
+            with fitz.open(path) as doc:
+                return "\n".join(page.get_text("text") for page in doc)
+        except Exception as e:
+            return f"Error reading PDF: {e}"
+    if ext == "docx":
+        try:
+            return "\n".join(p.text for p in docx.Document(path).paragraphs)
+        except Exception as e:
+            return f"Error reading DOCX: {e}"
+    if ext == "pptx":
+        try:
+            prs = pptx.Presentation(path)
+            # One line per shape that exposes text; join once instead of
+            # growing a string with += inside the nested loops.
+            return "".join(
+                shape.text + "\n"
+                for slide in prs.slides
+                for shape in slide.shapes
+                if hasattr(shape, "text")
+            )
+        except Exception as e:
+            return f"Error reading PPTX: {e}"
+    if ext == "xlsx":
+        try:
+            wb = openpyxl.load_workbook(path)
+            lines = []
+            for sheet in wb.sheetnames:
+                for row in wb[sheet].iter_rows(values_only=True):
+                    # Skip only empty cells. A plain truthiness test would
+                    # also drop legitimate values such as 0 and False.
+                    cells = [str(cell) for cell in row if cell is not None and cell != ""]
+                    lines.append(" ".join(cells) + "\n")
+            return "".join(lines)
+        except Exception as e:
+            return f"Error reading XLSX: {e}"
+    return "Unsupported file format"
+# Summarization logic
+def summarize_document(file):
+    """Summarize an uploaded document to roughly 20% of its word count.
+
+    Args:
+        file: The uploaded file object passed on to ``extract_text``, or
+            ``None`` when nothing was uploaded.
+
+    Returns:
+        The generated summary, or a human-readable message when no file was
+        given, the file could not be read, it contains no text, or the
+        model call fails.
+    """
+    if file is None:
+        return "Please upload a document."
+    text = extract_text(file)
+    # Match only the exact failure messages produced by extract_text. A plain
+    # substring test would misclassify any document that merely contains the
+    # words "Error" or "Unsupported" and echo the whole document back.
+    if text.startswith(("Error reading", "Unsupported file format")):
+        return text
+    word_count = len(text.split())
+    if word_count == 0:
+        return "No extractable text found in the document."
+    # NOTE(review): the length limits are derived from a word count, while the
+    # pipeline presumably measures max_length/min_length in tokens, so the
+    # "20%" target is approximate - confirm.
+    max_summary_len = max(20, int(word_count * 0.2))
     try:
+        summary = pipe(text, max_length=max_summary_len, min_length=int(max_summary_len * 0.6), do_sample=False)
+        return summary[0]['generated_text']
     except Exception as e:
+        return f"Error during summarization: {e}"
 # Gradio Interface: a single file-upload input handled by summarize_document, with the result shown in a text box
+demo = gr.Interface(
+    fn=summarize_document,
+    inputs=gr.File(label="Upload a document (PDF, DOCX, PPTX, XLSX)", file_types=[".pdf", ".docx", ".pptx", ".xlsx"]),
+    outputs=gr.Textbox(label="20% Summary"),
+    title="📄 Document Summarizer (20% Length)",
+    description="Upload a document and get a concise summary generated by your custom Hugging Face model."
+)
+# FastAPI setup: create the ASGI application that hosts the Gradio UI
+app = FastAPI()
+# Mount Gradio at "/" (mount_gradio_app returns the app, which is rebound to `app`)
 app = gr.mount_gradio_app(app, demo, path="/")
+# Optional root redirect. NOTE(review): this redirects "/" to "/" itself, and it is registered after Gradio was mounted at "/", so it looks unreachable and would loop if it were ever hit - confirm, then remove or retarget it.
 @app.get("/")
 def redirect_to_interface():
+    return RedirectResponse(url="/")