ikraamkb committed on
Commit
12d05c0
·
verified ·
1 Parent(s): 653c3ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -32
app.py CHANGED
@@ -16,31 +16,25 @@ image_captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-cap
16
  app = FastAPI()
17
 
18
  # -------------------------
19
- # File Text Extractors
20
  # -------------------------
21
 
22
- def extract_text_from_pdf(file):
23
  try:
24
- file.seek(0)
25
- data = file.read()
26
  with fitz.open(stream=data, filetype="pdf") as doc:
27
  return "\n".join([page.get_text() for page in doc])
28
  except Exception as e:
29
  return f"❌ PDF error: {e}"
30
 
31
- def extract_text_from_docx(file):
32
  try:
33
- file.seek(0)
34
- data = file.read()
35
  doc = docx.Document(io.BytesIO(data))
36
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
37
  except Exception as e:
38
  return f"❌ DOCX error: {e}"
39
 
40
- def extract_text_from_pptx(file):
41
  try:
42
- file.seek(0)
43
- data = file.read()
44
  prs = pptx.Presentation(io.BytesIO(data))
45
  text = []
46
  for slide in prs.slides:
@@ -51,10 +45,8 @@ def extract_text_from_pptx(file):
51
  except Exception as e:
52
  return f"❌ PPTX error: {e}"
53
 
54
- def extract_text_from_xlsx(file):
55
  try:
56
- file.seek(0)
57
- data = file.read()
58
  wb = openpyxl.load_workbook(io.BytesIO(data))
59
  text = []
60
  for sheet in wb.sheetnames:
@@ -71,27 +63,29 @@ def extract_text_from_xlsx(file):
71
  # -------------------------
72
 
73
  def summarize_document(file):
74
- filename = file.name.lower()
75
-
76
- if filename.endswith(".pdf"):
77
- text = extract_text_from_pdf(file)
78
- elif filename.endswith(".docx"):
79
- text = extract_text_from_docx(file)
80
- elif filename.endswith(".pptx"):
81
- text = extract_text_from_pptx(file)
82
- elif filename.endswith(".xlsx"):
83
- text = extract_text_from_xlsx(file)
84
- else:
85
- return "❌ Unsupported file format."
86
-
87
- if not text.strip():
88
- return "❗ No extractable text."
89
-
90
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
92
  return f"📄 Summary:\n{summary[0]['summary_text']}"
 
93
  except Exception as e:
94
- return f"⚠️ Summarization error: {e}"
95
 
96
  def interpret_image(image):
97
  try:
@@ -114,11 +108,11 @@ img_caption = gr.Interface(
114
  fn=interpret_image,
115
  inputs=gr.Image(type="pil", label="Upload an Image"),
116
  outputs="text",
117
- title="🖼️ Image Captioning"
118
  )
119
 
120
  # -------------------------
121
- # Launch via FastAPI
122
  # -------------------------
123
 
124
  demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])
 
16
  app = FastAPI()
17
 
18
  # -------------------------
19
+ # Extraction Functions
20
  # -------------------------
21
 
22
+ def extract_text_from_pdf(data: bytes):
23
  try:
 
 
24
  with fitz.open(stream=data, filetype="pdf") as doc:
25
  return "\n".join([page.get_text() for page in doc])
26
  except Exception as e:
27
  return f"❌ PDF error: {e}"
28
 
29
+ def extract_text_from_docx(data: bytes):
30
  try:
 
 
31
  doc = docx.Document(io.BytesIO(data))
32
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
33
  except Exception as e:
34
  return f"❌ DOCX error: {e}"
35
 
36
+ def extract_text_from_pptx(data: bytes):
37
  try:
 
 
38
  prs = pptx.Presentation(io.BytesIO(data))
39
  text = []
40
  for slide in prs.slides:
 
45
  except Exception as e:
46
  return f"❌ PPTX error: {e}"
47
 
48
+ def extract_text_from_xlsx(data: bytes):
49
  try:
 
 
50
  wb = openpyxl.load_workbook(io.BytesIO(data))
51
  text = []
52
  for sheet in wb.sheetnames:
 
63
  # -------------------------
64
 
65
  def summarize_document(file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  try:
67
+ filename = file.name.lower()
68
+ data = file.read()
69
+
70
+ if filename.endswith(".pdf"):
71
+ text = extract_text_from_pdf(data)
72
+ elif filename.endswith(".docx"):
73
+ text = extract_text_from_docx(data)
74
+ elif filename.endswith(".pptx"):
75
+ text = extract_text_from_pptx(data)
76
+ elif filename.endswith(".xlsx"):
77
+ text = extract_text_from_xlsx(data)
78
+ else:
79
+ return "❌ Unsupported file format."
80
+
81
+ if not isinstance(text, str) or not text.strip():
82
+ return "❗ No extractable text."
83
+
84
  summary = summarizer(text[:3000], max_length=150, min_length=30, do_sample=False)
85
  return f"📄 Summary:\n{summary[0]['summary_text']}"
86
+
87
  except Exception as e:
88
+ return f"⚠️ Unexpected error: {e}"
89
 
90
  def interpret_image(image):
91
  try:
 
108
  fn=interpret_image,
109
  inputs=gr.Image(type="pil", label="Upload an Image"),
110
  outputs="text",
111
+ title="🖼️ Image Interpreter"
112
  )
113
 
114
  # -------------------------
115
+ # FastAPI + Gradio Mount
116
  # -------------------------
117
 
118
  demo = gr.TabbedInterface([doc_summary, img_caption], ["Document Summary", "Image Captioning"])