ikraamkb committed on
Commit
653c3ae
·
verified ·
1 Parent(s): 3fb07d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -57
app.py CHANGED
@@ -1,49 +1,47 @@
1
  from fastapi import FastAPI
2
  from fastapi.responses import RedirectResponse
3
- from transformers import pipeline
4
- from PIL import Image
5
  import fitz # PyMuPDF
6
  import docx
7
  import pptx
8
  import openpyxl
9
  import io
10
-
11
  import gradio as gr
 
12
 
13
- # Initialize models
14
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
15
  image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
16
 
17
- # FastAPI app
18
  app = FastAPI()
19
 
20
  # -------------------------
21
- # Helper Functions
22
  # -------------------------
23
 
24
- def extract_text_from_pdf(upload):
25
  try:
26
- file_bytes = upload.read()
27
- stream = io.BytesIO(file_bytes)
28
- with fitz.open(stream=stream, filetype="pdf") as doc:
29
  return "\n".join([page.get_text() for page in doc])
30
  except Exception as e:
31
- return f"❌ PDF extraction error: {e}"
32
 
33
- def extract_text_from_docx(upload):
34
  try:
35
- file_bytes = upload.read()
36
- stream = io.BytesIO(file_bytes)
37
- doc = docx.Document(stream)
38
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
39
  except Exception as e:
40
- return f"❌ DOCX extraction error: {e}"
41
 
42
- def extract_text_from_pptx(upload):
43
  try:
44
- file_bytes = upload.read()
45
- stream = io.BytesIO(file_bytes)
46
- prs = pptx.Presentation(stream)
47
  text = []
48
  for slide in prs.slides:
49
  for shape in slide.shapes:
@@ -51,46 +49,43 @@ def extract_text_from_pptx(upload):
51
  text.append(shape.text)
52
  return "\n".join(text)
53
  except Exception as e:
54
- return f"❌ PPTX extraction error: {e}"
55
 
56
- def extract_text_from_xlsx(upload):
57
  try:
58
- file_bytes = upload.read()
59
- stream = io.BytesIO(file_bytes)
60
- wb = openpyxl.load_workbook(stream)
61
  text = []
62
  for sheet in wb.sheetnames:
63
  ws = wb[sheet]
64
  for row in ws.iter_rows(values_only=True):
65
- text.append(" ".join(str(cell) for cell in row if cell))
 
66
  return "\n".join(text)
67
  except Exception as e:
68
- return f"❌ XLSX extraction error: {e}"
69
 
70
  # -------------------------
71
- # Core Functions
72
  # -------------------------
73
 
74
- def summarize_document(upload):
75
- if not upload:
76
- return "⚠️ No file uploaded."
77
-
78
- ext = upload.name.lower()
79
- upload.seek(0)
80
-
81
- if ext.endswith(".pdf"):
82
- text = extract_text_from_pdf(upload)
83
- elif ext.endswith(".docx"):
84
- text = extract_text_from_docx(upload)
85
- elif ext.endswith(".pptx"):
86
- text = extract_text_from_pptx(upload)
87
- elif ext.endswith(".xlsx"):
88
- text = extract_text_from_xlsx(upload)
89
  else:
90
- return "❌ Unsupported file type."
91
 
92
- if not text or not text.strip() or text.startswith("❌"):
93
- return text if text.startswith("❌") else "❗ No extractable text found."
94
 
95
  try:
96
  summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
@@ -99,34 +94,36 @@ def summarize_document(upload):
99
  return f"⚠️ Summarization error: {e}"
100
 
101
  def interpret_image(image):
102
- if not image:
103
- return "⚠️ No image uploaded."
104
  try:
105
  return f"🖼️ Caption:\n{image_captioner(image)[0]['generated_text']}"
106
  except Exception as e:
107
  return f"⚠️ Image captioning error: {e}"
108
 
109
  # -------------------------
110
- # Gradio Interface
111
  # -------------------------
112
 
113
- doc_ui = gr.Interface(
114
  fn=summarize_document,
115
- inputs=gr.File(label="Upload a Document (PDF, DOCX, PPTX, XLSX)"),
116
- outputs=gr.Textbox(label="Summary"),
117
  title="📄 Document Summarizer"
118
  )
119
 
120
- img_ui = gr.Interface(
121
  fn=interpret_image,
122
  inputs=gr.Image(type="pil", label="Upload an Image"),
123
- outputs=gr.Textbox(label="Caption"),
124
- title="🖼️ Image Interpreter"
125
  )
126
 
127
- demo = gr.TabbedInterface([doc_ui, img_ui], ["Document Summarization", "Image Captioning"])
 
 
 
 
128
  app = gr.mount_gradio_app(app, demo, path="/")
129
 
130
  @app.get("/")
131
- def redirect_to_ui():
132
  return RedirectResponse(url="/")
 
1
  from fastapi import FastAPI
2
  from fastapi.responses import RedirectResponse
 
 
3
  import fitz # PyMuPDF
4
  import docx
5
  import pptx
6
  import openpyxl
7
  import io
8
+ from PIL import Image
9
  import gradio as gr
10
+ from transformers import pipeline
11
 
12
+ # Load models
13
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
14
  image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
15
 
 
16
  app = FastAPI()
17
 
18
  # -------------------------
19
+ # File Text Extractors
20
  # -------------------------
21
 
22
+ def extract_text_from_pdf(file):
23
  try:
24
+ file.seek(0)
25
+ data = file.read()
26
+ with fitz.open(stream=data, filetype="pdf") as doc:
27
  return "\n".join([page.get_text() for page in doc])
28
  except Exception as e:
29
+ return f"❌ PDF error: {e}"
30
 
31
+ def extract_text_from_docx(file):
32
  try:
33
+ file.seek(0)
34
+ data = file.read()
35
+ doc = docx.Document(io.BytesIO(data))
36
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
37
  except Exception as e:
38
+ return f"❌ DOCX error: {e}"
39
 
40
+ def extract_text_from_pptx(file):
41
  try:
42
+ file.seek(0)
43
+ data = file.read()
44
+ prs = pptx.Presentation(io.BytesIO(data))
45
  text = []
46
  for slide in prs.slides:
47
  for shape in slide.shapes:
 
49
  text.append(shape.text)
50
  return "\n".join(text)
51
  except Exception as e:
52
+ return f"❌ PPTX error: {e}"
53
 
54
+ def extract_text_from_xlsx(file):
55
  try:
56
+ file.seek(0)
57
+ data = file.read()
58
+ wb = openpyxl.load_workbook(io.BytesIO(data))
59
  text = []
60
  for sheet in wb.sheetnames:
61
  ws = wb[sheet]
62
  for row in ws.iter_rows(values_only=True):
63
+ line = " ".join(str(cell) for cell in row if cell)
64
+ text.append(line)
65
  return "\n".join(text)
66
  except Exception as e:
67
+ return f"❌ XLSX error: {e}"
68
 
69
  # -------------------------
70
+ # Main Logic
71
  # -------------------------
72
 
73
+ def summarize_document(file):
74
+ filename = file.name.lower()
75
+
76
+ if filename.endswith(".pdf"):
77
+ text = extract_text_from_pdf(file)
78
+ elif filename.endswith(".docx"):
79
+ text = extract_text_from_docx(file)
80
+ elif filename.endswith(".pptx"):
81
+ text = extract_text_from_pptx(file)
82
+ elif filename.endswith(".xlsx"):
83
+ text = extract_text_from_xlsx(file)
 
 
 
 
84
  else:
85
+ return "❌ Unsupported file format."
86
 
87
+ if not text.strip():
88
+ return "❗ No extractable text."
89
 
90
  try:
91
  summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
 
94
  return f"⚠️ Summarization error: {e}"
95
 
96
  def interpret_image(image):
 
 
97
  try:
98
  return f"🖼️ Caption:\n{image_captioner(image)[0]['generated_text']}"
99
  except Exception as e:
100
  return f"⚠️ Image captioning error: {e}"
101
 
102
  # -------------------------
103
+ # Gradio Interfaces
104
  # -------------------------
105
 
106
+ doc_summary = gr.Interface(
107
  fn=summarize_document,
108
+ inputs=gr.File(label="Upload a Document"),
109
+ outputs="text",
110
  title="📄 Document Summarizer"
111
  )
112
 
113
+ img_caption = gr.Interface(
114
  fn=interpret_image,
115
  inputs=gr.Image(type="pil", label="Upload an Image"),
116
+ outputs="text",
117
+ title="🖼️ Image Captioning"
118
  )
119
 
120
+ # -------------------------
121
+ # Launch via FastAPI
122
+ # -------------------------
123
+
124
+ demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
125
  app = gr.mount_gradio_app(app, demo, path="/")
126
 
127
  @app.get("/")
128
+ def root():
129
  return RedirectResponse(url="/")