ikraamkb committed on
Commit
95c2451
·
verified ·
1 Parent(s): a0f361a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -36
app.py CHANGED
@@ -1,13 +1,11 @@
1
- from fastapi import FastAPI, UploadFile
2
  from fastapi.responses import RedirectResponse
3
  import fitz # PyMuPDF
4
  import docx
5
  import openpyxl
6
  import pptx
7
- from PIL import Image
8
  import io
9
-
10
-
11
 
12
  import gradio as gr
13
  from transformers import pipeline
@@ -21,29 +19,28 @@ app = FastAPI()
21
  # -------------------------
22
  # Document Extraction Utils
23
  # -------------------------
24
- def extract_text_from_pdf(file):
25
- file.seek(0) # Reset stream position to beginning
26
- with fitz.open(stream=file.read(), filetype="pdf") as doc:
27
- text = ""
28
  for page in doc:
29
  text += page.get_text()
30
  return text
31
 
32
- def extract_text_from_docx(file):
33
- doc = docx.Document(io.BytesIO(file.read()))
34
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
35
 
36
- def extract_text_from_pptx(file):
37
  text = []
38
- prs = pptx.Presentation(io.BytesIO(file.read()))
39
  for slide in prs.slides:
40
  for shape in slide.shapes:
41
  if hasattr(shape, "text"):
42
  text.append(shape.text)
43
  return "\n".join(text)
44
 
45
- def extract_text_from_xlsx(file):
46
- wb = openpyxl.load_workbook(io.BytesIO(file.read()))
47
  text = []
48
  for sheet in wb.sheetnames:
49
  ws = wb[sheet]
@@ -53,39 +50,36 @@ def extract_text_from_xlsx(file):
53
  return "\n".join(text)
54
 
55
  def summarize_document(file):
56
- import os
57
-
58
- name = getattr(file, "name", "")
59
- ext = os.path.splitext(name)[1].lower()
60
-
61
- if ext == ".pdf":
62
- text = extract_text_from_pdf(file)
63
- elif ext == ".docx":
64
- text = extract_text_from_docx(file)
65
- elif ext == ".pptx":
66
- text = extract_text_from_pptx(file)
67
- elif ext == ".xlsx":
68
- text = extract_text_from_xlsx(file)
69
  else:
70
- return "Unsupported file format."
71
 
72
  if not text.strip():
73
- return "No extractable text found."
74
 
75
- text = text[:3000]
76
  try:
77
- summary = summarizer(text, max_length=150, min_length=30, do_sample=False)
78
- return summary[0]["summary_text"]
79
  except Exception as e:
80
- return f"Summarization error: {e}"
81
 
82
  def interpret_image(image):
83
  if image is None:
84
  return "No image uploaded."
85
  try:
86
- return image_captioner(image)[0]["generated_text"]
87
  except Exception as e:
88
- return f"Image captioning error: {e}"
89
 
90
  # -------------------------
91
  # Gradio Interfaces
@@ -107,7 +101,7 @@ img_caption = gr.Interface(
107
  # -------------------------
108
  # Combine into Gradio + FastAPI
109
  # -------------------------
110
- demo = gr.TabbedInterface([doc_summary, img_caption], ["Document QA", "Image QA"])
111
  app = gr.mount_gradio_app(app, demo, path="/")
112
 
113
  @app.get("/")
 
1
+ from fastapi import FastAPI
2
  from fastapi.responses import RedirectResponse
3
  import fitz # PyMuPDF
4
  import docx
5
  import openpyxl
6
  import pptx
 
7
  import io
8
+ from PIL import Image
 
9
 
10
  import gradio as gr
11
  from transformers import pipeline
 
19
  # -------------------------
20
  # Document Extraction Utils
21
  # -------------------------
22
+ def extract_text_from_pdf(file_bytes):
23
+ text = ""
24
+ with fitz.open(stream=file_bytes, filetype="pdf") as doc:
 
25
  for page in doc:
26
  text += page.get_text()
27
  return text
28
 
29
+ def extract_text_from_docx(file_bytes):
30
+ doc = docx.Document(io.BytesIO(file_bytes))
31
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
32
 
33
+ def extract_text_from_pptx(file_bytes):
34
  text = []
35
+ prs = pptx.Presentation(io.BytesIO(file_bytes))
36
  for slide in prs.slides:
37
  for shape in slide.shapes:
38
  if hasattr(shape, "text"):
39
  text.append(shape.text)
40
  return "\n".join(text)
41
 
42
+ def extract_text_from_xlsx(file_bytes):
43
+ wb = openpyxl.load_workbook(io.BytesIO(file_bytes))
44
  text = []
45
  for sheet in wb.sheetnames:
46
  ws = wb[sheet]
 
50
  return "\n".join(text)
51
 
52
  def summarize_document(file):
53
+ file_bytes = file.read()
54
+ filename = getattr(file, "name", "").lower()
55
+
56
+ if filename.endswith(".pdf"):
57
+ text = extract_text_from_pdf(file_bytes)
58
+ elif filename.endswith(".docx"):
59
+ text = extract_text_from_docx(file_bytes)
60
+ elif filename.endswith(".pptx"):
61
+ text = extract_text_from_pptx(file_bytes)
62
+ elif filename.endswith(".xlsx"):
63
+ text = extract_text_from_xlsx(file_bytes)
 
 
64
  else:
65
+ return "Unsupported file format."
66
 
67
  if not text.strip():
68
+ return "No extractable text found."
69
 
 
70
  try:
71
+ summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
72
+ return f"📄 Summary:\n{summary[0]['summary_text']}"
73
  except Exception as e:
74
+ return f"⚠️ Summarization error: {e}"
75
 
76
  def interpret_image(image):
77
  if image is None:
78
  return "No image uploaded."
79
  try:
80
+ return f"🖼️ Caption:\n{image_captioner(image)[0]['generated_text']}"
81
  except Exception as e:
82
+ return f"⚠️ Image captioning error: {e}"
83
 
84
  # -------------------------
85
  # Gradio Interfaces
 
101
  # -------------------------
102
  # Combine into Gradio + FastAPI
103
  # -------------------------
104
+ demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
105
  app = gr.mount_gradio_app(app, demo, path="/")
106
 
107
  @app.get("/")