ikraamkb committed on
Commit
af32fa4
·
verified ·
1 Parent(s): 95c2451

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -28
app.py CHANGED
@@ -6,49 +6,60 @@ import openpyxl
6
  import pptx
7
  import io
8
  from PIL import Image
9
-
10
  import gradio as gr
11
  from transformers import pipeline
12
 
13
- # Models
14
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
15
  image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
16
 
17
  app = FastAPI()
18
 
19
  # -------------------------
20
- # Document Extraction Utils
21
  # -------------------------
22
  def extract_text_from_pdf(file_bytes):
23
- text = ""
24
- with fitz.open(stream=file_bytes, filetype="pdf") as doc:
25
- for page in doc:
26
- text += page.get_text()
27
- return text
28
 
29
  def extract_text_from_docx(file_bytes):
30
- doc = docx.Document(io.BytesIO(file_bytes))
31
- return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
 
 
 
32
 
33
  def extract_text_from_pptx(file_bytes):
34
- text = []
35
- prs = pptx.Presentation(io.BytesIO(file_bytes))
36
- for slide in prs.slides:
37
- for shape in slide.shapes:
38
- if hasattr(shape, "text"):
39
- text.append(shape.text)
40
- return "\n".join(text)
 
 
 
41
 
42
  def extract_text_from_xlsx(file_bytes):
43
- wb = openpyxl.load_workbook(io.BytesIO(file_bytes))
44
- text = []
45
- for sheet in wb.sheetnames:
46
- ws = wb[sheet]
47
- for row in ws.iter_rows(values_only=True):
48
- line = " ".join(str(cell) for cell in row if cell)
49
- text.append(line)
50
- return "\n".join(text)
 
 
 
51
 
 
 
 
52
  def summarize_document(file):
53
  file_bytes = file.read()
54
  filename = getattr(file, "name", "").lower()
@@ -74,8 +85,6 @@ def summarize_document(file):
74
  return f"⚠️ Summarization error: {e}"
75
 
76
  def interpret_image(image):
77
- if image is None:
78
- return "No image uploaded."
79
  try:
80
  return f"🖼️ Caption:\n{image_captioner(image)[0]['generated_text']}"
81
  except Exception as e:
@@ -99,7 +108,7 @@ img_caption = gr.Interface(
99
  )
100
 
101
  # -------------------------
102
- # Combine into Gradio + FastAPI
103
  # -------------------------
104
  demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
105
  app = gr.mount_gradio_app(app, demo, path="/")
 
6
  import pptx
7
  import io
8
  from PIL import Image
 
9
  import gradio as gr
10
  from transformers import pipeline
11
 
12
+ # Load models
13
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
14
  image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
15
 
16
  app = FastAPI()
17
 
18
  # -------------------------
19
+ # Extraction Functions
20
  # -------------------------
21
  def extract_text_from_pdf(file_bytes):
22
+ try:
23
+ with fitz.open(stream=file_bytes, filetype="pdf") as doc:
24
+ return "\n".join([page.get_text() for page in doc])
25
+ except Exception as e:
26
+ return f"❌ PDF extraction error: {e}"
27
 
28
  def extract_text_from_docx(file_bytes):
29
+ try:
30
+ doc = docx.Document(io.BytesIO(file_bytes))
31
+ return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
32
+ except Exception as e:
33
+ return f"❌ DOCX extraction error: {e}"
34
 
35
  def extract_text_from_pptx(file_bytes):
36
+ try:
37
+ prs = pptx.Presentation(io.BytesIO(file_bytes))
38
+ text = []
39
+ for slide in prs.slides:
40
+ for shape in slide.shapes:
41
+ if hasattr(shape, "text"):
42
+ text.append(shape.text)
43
+ return "\n".join(text)
44
+ except Exception as e:
45
+ return f"❌ PPTX extraction error: {e}"
46
 
47
  def extract_text_from_xlsx(file_bytes):
48
+ try:
49
+ wb = openpyxl.load_workbook(io.BytesIO(file_bytes))
50
+ text = []
51
+ for sheet in wb.sheetnames:
52
+ ws = wb[sheet]
53
+ for row in ws.iter_rows(values_only=True):
54
+ line = " ".join(str(cell) for cell in row if cell)
55
+ text.append(line)
56
+ return "\n".join(text)
57
+ except Exception as e:
58
+ return f"❌ XLSX extraction error: {e}"
59
 
60
+ # -------------------------
61
+ # Main Logic
62
+ # -------------------------
63
  def summarize_document(file):
64
  file_bytes = file.read()
65
  filename = getattr(file, "name", "").lower()
 
85
  return f"⚠️ Summarization error: {e}"
86
 
87
  def interpret_image(image):
 
 
88
  try:
89
  return f"🖼️ Caption:\n{image_captioner(image)[0]['generated_text']}"
90
  except Exception as e:
 
108
  )
109
 
110
  # -------------------------
111
+ # Launch with FastAPI
112
  # -------------------------
113
  demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
114
  app = gr.mount_gradio_app(app, demo, path="/")