ikraamkb committed on
Commit
40485d4
·
verified ·
1 Parent(s): 44d6661

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -33
app.py CHANGED
@@ -1,12 +1,10 @@
1
  from fastapi import FastAPI
2
  from fastapi.responses import RedirectResponse
3
- import fitz
4
  import docx
5
  import openpyxl
6
  import pptx
7
  import io
8
- import os
9
- import tempfile
10
  from PIL import Image
11
  import gradio as gr
12
  from transformers import pipeline
@@ -18,30 +16,28 @@ image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-cap
18
  app = FastAPI()
19
 
20
  # -------------------------
21
- # Extraction Functions
22
  # -------------------------
23
- def extract_text_from_pdf(file_bytes):
24
  try:
25
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
26
- tmp.write(file_bytes)
27
- tmp_path = tmp.name
28
- with fitz.open(tmp_path) as doc:
29
- text = "\n".join(page.get_text() for page in doc)
30
- os.unlink(tmp_path)
31
- return text
32
  except Exception as e:
33
  return f"❌ PDF extraction error: {e}"
34
 
35
- def extract_text_from_docx(file_bytes):
36
  try:
37
- doc = docx.Document(io.BytesIO(file_bytes))
 
38
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
39
  except Exception as e:
40
  return f"❌ DOCX extraction error: {e}"
41
 
42
- def extract_text_from_pptx(file_bytes):
43
  try:
44
- prs = pptx.Presentation(io.BytesIO(file_bytes))
 
45
  text = []
46
  for slide in prs.slides:
47
  for shape in slide.shapes:
@@ -51,15 +47,15 @@ def extract_text_from_pptx(file_bytes):
51
  except Exception as e:
52
  return f"❌ PPTX extraction error: {e}"
53
 
54
- def extract_text_from_xlsx(file_bytes):
55
  try:
56
- wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
 
57
  text = []
58
  for sheet in wb.sheetnames:
59
  ws = wb[sheet]
60
  for row in ws.iter_rows(values_only=True):
61
- line = " ".join(str(cell) for cell in row if cell)
62
- text.append(line)
63
  return "\n".join(text)
64
  except Exception as e:
65
  return f"❌ XLSX extraction error: {e}"
@@ -68,21 +64,19 @@ def extract_text_from_xlsx(file_bytes):
68
  # Main Logic
69
  # -------------------------
70
  def summarize_document(file):
71
- file_bytes = file.read()
72
- filename = getattr(file, "name", "").lower()
73
-
74
- if filename.endswith(".pdf"):
75
- text = extract_text_from_pdf(file_bytes)
76
- elif filename.endswith(".docx"):
77
- text = extract_text_from_docx(file_bytes)
78
- elif filename.endswith(".pptx"):
79
- text = extract_text_from_pptx(file_bytes)
80
- elif filename.endswith(".xlsx"):
81
- text = extract_text_from_xlsx(file_bytes)
82
  else:
83
  return "❌ Unsupported file format."
84
 
85
- if not text or not text.strip():
86
  return "❗ No extractable text found."
87
 
88
  try:
@@ -115,7 +109,7 @@ img_caption = gr.Interface(
115
  )
116
 
117
  # -------------------------
118
- # FastAPI Integration
119
  # -------------------------
120
  demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
121
  app = gr.mount_gradio_app(app, demo, path="/")
 
1
  from fastapi import FastAPI
2
  from fastapi.responses import RedirectResponse
3
+ import fitz # PyMuPDF
4
  import docx
5
  import openpyxl
6
  import pptx
7
  import io
 
 
8
  from PIL import Image
9
  import gradio as gr
10
  from transformers import pipeline
 
16
  app = FastAPI()
17
 
18
  # -------------------------
19
+ # File Extraction Helpers
20
  # -------------------------
21
+ def extract_text_from_pdf(file_obj):
22
  try:
23
+ file_obj.seek(0)
24
+ with fitz.open(stream=file_obj.read(), filetype="pdf") as doc:
25
+ return "\n".join([page.get_text() for page in doc])
 
 
 
 
26
  except Exception as e:
27
  return f"❌ PDF extraction error: {e}"
28
 
29
+ def extract_text_from_docx(file_obj):
30
  try:
31
+ file_obj.seek(0)
32
+ doc = docx.Document(io.BytesIO(file_obj.read()))
33
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
34
  except Exception as e:
35
  return f"❌ DOCX extraction error: {e}"
36
 
37
+ def extract_text_from_pptx(file_obj):
38
  try:
39
+ file_obj.seek(0)
40
+ prs = pptx.Presentation(io.BytesIO(file_obj.read()))
41
  text = []
42
  for slide in prs.slides:
43
  for shape in slide.shapes:
 
47
  except Exception as e:
48
  return f"❌ PPTX extraction error: {e}"
49
 
50
+ def extract_text_from_xlsx(file_obj):
51
  try:
52
+ file_obj.seek(0)
53
+ wb = openpyxl.load_workbook(io.BytesIO(file_obj.read()))
54
  text = []
55
  for sheet in wb.sheetnames:
56
  ws = wb[sheet]
57
  for row in ws.iter_rows(values_only=True):
58
+ text.append(" ".join(str(cell) for cell in row if cell))
 
59
  return "\n".join(text)
60
  except Exception as e:
61
  return f"❌ XLSX extraction error: {e}"
 
64
  # Main Logic
65
  # -------------------------
66
  def summarize_document(file):
67
+ name = getattr(file, "name", "").lower()
68
+ if name.endswith(".pdf"):
69
+ text = extract_text_from_pdf(file)
70
+ elif name.endswith(".docx"):
71
+ text = extract_text_from_docx(file)
72
+ elif name.endswith(".pptx"):
73
+ text = extract_text_from_pptx(file)
74
+ elif name.endswith(".xlsx"):
75
+ text = extract_text_from_xlsx(file)
 
 
76
  else:
77
  return "❌ Unsupported file format."
78
 
79
+ if not text or not isinstance(text, str) or not text.strip():
80
  return "❗ No extractable text found."
81
 
82
  try:
 
109
  )
110
 
111
  # -------------------------
112
+ # Launch with FastAPI
113
  # -------------------------
114
  demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
115
  app = gr.mount_gradio_app(app, demo, path="/")