Spaces:

ikraamkb
/

Summarization

Running

App Files Community

ikraamkb committed on 19 days ago

Commit

40485d4

verified ·

1 Parent(s): 44d6661

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -33

app.py CHANGED Viewed

@@ -1,12 +1,10 @@
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
-import fitz
 import docx
 import openpyxl
 import pptx
 import io
-import os
-import tempfile
 from PIL import Image
 import gradio as gr
 from transformers import pipeline
@@ -18,30 +16,28 @@ image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-cap
 app = FastAPI()
 # -------------------------
-# Extraction Functions
 # -------------------------
-def extract_text_from_pdf(file_bytes):
     try:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
-            tmp.write(file_bytes)
-            tmp_path = tmp.name
-        with fitz.open(tmp_path) as doc:
-            text = "\n".join(page.get_text() for page in doc)
-        os.unlink(tmp_path)
-        return text
     except Exception as e:
         return f"❌ PDF extraction error: {e}"
-def extract_text_from_docx(file_bytes):
     try:
-        doc = docx.Document(io.BytesIO(file_bytes))
         return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
     except Exception as e:
         return f"❌ DOCX extraction error: {e}"
-def extract_text_from_pptx(file_bytes):
     try:
-        prs = pptx.Presentation(io.BytesIO(file_bytes))
         text = []
         for slide in prs.slides:
             for shape in slide.shapes:
@@ -51,15 +47,15 @@ def extract_text_from_pptx(file_bytes):
     except Exception as e:
         return f"❌ PPTX extraction error: {e}"
-def extract_text_from_xlsx(file_bytes):
     try:
-        wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
         text = []
         for sheet in wb.sheetnames:
             ws = wb[sheet]
             for row in ws.iter_rows(values_only=True):
-                line = " ".join(str(cell) for cell in row if cell)
-                text.append(line)
         return "\n".join(text)
     except Exception as e:
         return f"❌ XLSX extraction error: {e}"
@@ -68,21 +64,19 @@ def extract_text_from_xlsx(file_bytes):
 # Main Logic
 # -------------------------
 def summarize_document(file):
-    file_bytes = file.read()
-    filename = getattr(file, "name", "").lower()
-    if filename.endswith(".pdf"):
-        text = extract_text_from_pdf(file_bytes)
-    elif filename.endswith(".docx"):
-        text = extract_text_from_docx(file_bytes)
-    elif filename.endswith(".pptx"):
-        text = extract_text_from_pptx(file_bytes)
-    elif filename.endswith(".xlsx"):
-        text = extract_text_from_xlsx(file_bytes)
     else:
         return "❌ Unsupported file format."
-    if not text or not text.strip():
         return "❗ No extractable text found."
     try:
@@ -115,7 +109,7 @@ img_caption = gr.Interface(
 )
 # -------------------------
-# FastAPI Integration
 # -------------------------
 demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
 app = gr.mount_gradio_app(app, demo, path="/")

 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
+import fitz  # PyMuPDF
 import docx
 import openpyxl
 import pptx
 import io
 from PIL import Image
 import gradio as gr
 from transformers import pipeline
 app = FastAPI()
 # -------------------------
+# File Extraction Helpers
 # -------------------------
+def extract_text_from_pdf(file_obj):
     try:
+        file_obj.seek(0)
+        with fitz.open(stream=file_obj.read(), filetype="pdf") as doc:
+            return "\n".join([page.get_text() for page in doc])
     except Exception as e:
         return f"❌ PDF extraction error: {e}"
+def extract_text_from_docx(file_obj):
     try:
+        file_obj.seek(0)
+        doc = docx.Document(io.BytesIO(file_obj.read()))
         return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
     except Exception as e:
         return f"❌ DOCX extraction error: {e}"
+def extract_text_from_pptx(file_obj):
     try:
+        file_obj.seek(0)
+        prs = pptx.Presentation(io.BytesIO(file_obj.read()))
         text = []
         for slide in prs.slides:
             for shape in slide.shapes:
     except Exception as e:
         return f"❌ PPTX extraction error: {e}"
+def extract_text_from_xlsx(file_obj):
     try:
+        file_obj.seek(0)
+        wb = openpyxl.load_workbook(io.BytesIO(file_obj.read()))
         text = []
         for sheet in wb.sheetnames:
             ws = wb[sheet]
             for row in ws.iter_rows(values_only=True):
+                text.append(" ".join(str(cell) for cell in row if cell))
         return "\n".join(text)
     except Exception as e:
         return f"❌ XLSX extraction error: {e}"
 # Main Logic
 # -------------------------
 def summarize_document(file):
+    name = getattr(file, "name", "").lower()
+    if name.endswith(".pdf"):
+        text = extract_text_from_pdf(file)
+    elif name.endswith(".docx"):
+        text = extract_text_from_docx(file)
+    elif name.endswith(".pptx"):
+        text = extract_text_from_pptx(file)
+    elif name.endswith(".xlsx"):
+        text = extract_text_from_xlsx(file)
     else:
         return "❌ Unsupported file format."
+    if not text or not isinstance(text, str) or not text.strip():
         return "❗ No extractable text found."
     try:
 )
 # -------------------------
+# Launch with FastAPI
 # -------------------------
 demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
 app = gr.mount_gradio_app(app, demo, path="/")