Spaces:

ikraamkb
/

Summarization

Building

App Files Community

ikraamkb committed 17 days ago

Commit

95c2451

verified ·

1 Parent(s): a0f361a

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -36

app.py CHANGED Viewed

@@ -1,13 +1,11 @@
-from fastapi import FastAPI, UploadFile
 from fastapi.responses import RedirectResponse
 import fitz  # PyMuPDF
 import docx
 import openpyxl
 import pptx
-from PIL import Image
 import io
 import gradio as gr
 from transformers import pipeline
@@ -21,29 +19,28 @@ app = FastAPI()
 # -------------------------
 # Document Extraction Utils
 # -------------------------
-def extract_text_from_pdf(file):
-    file.seek(0)  # Reset stream position to beginning
-    with fitz.open(stream=file.read(), filetype="pdf") as doc:
-        text = ""
         for page in doc:
             text += page.get_text()
     return text
-def extract_text_from_docx(file):
-    doc = docx.Document(io.BytesIO(file.read()))
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-def extract_text_from_pptx(file):
     text = []
-    prs = pptx.Presentation(io.BytesIO(file.read()))
     for slide in prs.slides:
         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text.append(shape.text)
     return "\n".join(text)
-def extract_text_from_xlsx(file):
-    wb = openpyxl.load_workbook(io.BytesIO(file.read()))
     text = []
     for sheet in wb.sheetnames:
         ws = wb[sheet]
@@ -53,39 +50,36 @@ def extract_text_from_xlsx(file):
     return "\n".join(text)
 def summarize_document(file):
-    import os
-    name = getattr(file, "name", "")
-    ext = os.path.splitext(name)[1].lower()
-    if ext == ".pdf":
-        text = extract_text_from_pdf(file)
-    elif ext == ".docx":
-        text = extract_text_from_docx(file)
-    elif ext == ".pptx":
-        text = extract_text_from_pptx(file)
-    elif ext == ".xlsx":
-        text = extract_text_from_xlsx(file)
     else:
-        return "Unsupported file format."
     if not text.strip():
-        return "No extractable text found."
-    text = text[:3000]
     try:
-        summary = summarizer(text, max_length=150, min_length=30, do_sample=False)
-        return summary[0]["summary_text"]
     except Exception as e:
-        return f"Summarization error: {e}"
 def interpret_image(image):
     if image is None:
         return "No image uploaded."
     try:
-        return image_captioner(image)[0]["generated_text"]
     except Exception as e:
-        return f"Image captioning error: {e}"
 # -------------------------
 # Gradio Interfaces
@@ -107,7 +101,7 @@ img_caption = gr.Interface(
 # -------------------------
 # Combine into Gradio + FastAPI
 # -------------------------
-demo = gr.TabbedInterface([doc_summary, img_caption], ["Document QA", "Image QA"])
 app = gr.mount_gradio_app(app, demo, path="/")
 @app.get("/")

+from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
 import fitz  # PyMuPDF
 import docx
 import openpyxl
 import pptx
 import io
+from PIL import Image
 import gradio as gr
 from transformers import pipeline
 # -------------------------
 # Document Extraction Utils
 # -------------------------
+def extract_text_from_pdf(file_bytes):
+    text = ""
+    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
         for page in doc:
             text += page.get_text()
     return text
+def extract_text_from_docx(file_bytes):
+    doc = docx.Document(io.BytesIO(file_bytes))
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
+def extract_text_from_pptx(file_bytes):
     text = []
+    prs = pptx.Presentation(io.BytesIO(file_bytes))
     for slide in prs.slides:
         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text.append(shape.text)
     return "\n".join(text)
+def extract_text_from_xlsx(file_bytes):
+    wb = openpyxl.load_workbook(io.BytesIO(file_bytes))
     text = []
     for sheet in wb.sheetnames:
         ws = wb[sheet]
     return "\n".join(text)
 def summarize_document(file):
+    file_bytes = file.read()
+    filename = getattr(file, "name", "").lower()
+    if filename.endswith(".pdf"):
+        text = extract_text_from_pdf(file_bytes)
+    elif filename.endswith(".docx"):
+        text = extract_text_from_docx(file_bytes)
+    elif filename.endswith(".pptx"):
+        text = extract_text_from_pptx(file_bytes)
+    elif filename.endswith(".xlsx"):
+        text = extract_text_from_xlsx(file_bytes)
     else:
+        return "❌ Unsupported file format."
     if not text.strip():
+        return "❗ No extractable text found."
     try:
+        summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
+        return f"📄 Summary:\n{summary[0]['summary_text']}"
     except Exception as e:
+        return f"⚠️ Summarization error: {e}"
 def interpret_image(image):
     if image is None:
         return "No image uploaded."
     try:
+        return f"🖼️ Caption:\n{image_captioner(image)[0]['generated_text']}"
     except Exception as e:
+        return f"⚠️ Image captioning error: {e}"
 # -------------------------
 # Gradio Interfaces
 # -------------------------
 # Combine into Gradio + FastAPI
 # -------------------------
+demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
 app = gr.mount_gradio_app(app, demo, path="/")
 @app.get("/")