Spaces:

ikraamkb
/

Summarization

Running

App Files Files Community

ikraamkb committed 20 days ago

Commit

822dc40

verified ·

1 Parent(s): 6dfac5c

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -42

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ import docx
 import openpyxl
 import pptx
 import io
 from PIL import Image
 import gradio as gr
 from transformers import pipeline
@@ -18,10 +20,16 @@ app = FastAPI()
 # -------------------------
 # Extraction Functions
 # -------------------------
-"""def extract_text_from_pdf(file_bytes):
     try:
-        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
-            return "\n".join([page.get_text() for page in doc])
     except Exception as e:
         return f"❌ PDF extraction error: {e}"
@@ -46,7 +54,7 @@ def extract_text_from_pptx(file_bytes):
 def extract_text_from_xlsx(file_bytes):
     try:
-        wb = openpyxl.load_workbook(io.BytesIO(file_bytes))
         text = []
         for sheet in wb.sheetnames:
             ws = wb[sheet]
@@ -56,44 +64,7 @@ def extract_text_from_xlsx(file_bytes):
         return "\n".join(text)
     except Exception as e:
         return f"❌ XLSX extraction error: {e}"
-"""
-def extract_text_from_pdf(pdf_file):
-    text = []
-    try:
-        with fitz.open(pdf_file) as doc:
-            for page in doc:
-                text.append(page.get_text("text"))
-    except Exception as e:
-        return f"Error reading PDF: {e}"
-    return "\n".join(text)
-def extract_text_from_docx(docx_file):
-    doc = docx.Document(docx_file)
-    return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
-def extract_text_from_pptx(pptx_file):
-    text = []
-    try:
-        presentation = pptx.Presentation(pptx_file)
-        for slide in presentation.slides:
-            for shape in slide.shapes:
-                if hasattr(shape, "text"):
-                    text.append(shape.text)
-    except Exception as e:
-        return f"Error reading PPTX: {e}"
-    return "\n".join(text)
-def extract_text_from_xlsx(xlsx_file):
-    text = []
-    try:
-        wb = openpyxl.load_workbook(xlsx_file)
-        for sheet in wb.sheetnames:
-            ws = wb[sheet]
-            for row in ws.iter_rows(values_only=True):
-                text.append(" ".join(str(cell) for cell in row if cell))
-    except Exception as e:
-        return f"Error reading XLSX: {e}"
-    return "\n".join(text)
 # -------------------------
 # Main Logic
 # -------------------------
@@ -112,7 +83,7 @@ def summarize_document(file):
     else:
         return "❌ Unsupported file format."
-    if not text.strip():
         return "❗ No extractable text found."
     try:

 import openpyxl
 import pptx
 import io
+import os
+import tempfile
 from PIL import Image
 import gradio as gr
 from transformers import pipeline
 # -------------------------
 # Extraction Functions
 # -------------------------
+def extract_text_from_pdf(file_bytes):
+    """Extract the text of a PDF supplied as raw bytes.
+
+    The bytes are written to a temporary ``.pdf`` file, which is then opened
+    with ``fitz`` and read page by page.
+
+    Args:
+        file_bytes: Raw content of the PDF document.
+
+    Returns:
+        The text of every page joined with newlines, or a string starting
+        with "❌ PDF extraction error:" if any step raises.
+    """
+    tmp_path = None
     try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+            # Record the path before writing so cleanup still happens if the
+            # write itself fails.
+            tmp_path = tmp.name
+            tmp.write(file_bytes)
+        with fitz.open(tmp_path) as doc:
+            return "\n".join(page.get_text() for page in doc)
     except Exception as e:
         return f"❌ PDF extraction error: {e}"
+    finally:
+        # delete=False leaves the file on disk, so it must be removed on every
+        # exit path, not only after a successful extraction.
+        if tmp_path is not None:
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                # Best-effort cleanup: a failed delete must not replace the
+                # extraction result (or error message) being returned.
+                pass
 def extract_text_from_xlsx(file_bytes):
     try:
+        wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
         text = []
         for sheet in wb.sheetnames:
             ws = wb[sheet]
         return "\n".join(text)
     except Exception as e:
         return f"❌ XLSX extraction error: {e}"
 # -------------------------
 # Main Logic
 # -------------------------
     else:
         return "❌ Unsupported file format."
+    if not text or not text.strip():
         return "❗ No extractable text found."
     try: