ikraamkb committed on
Commit
822dc40
·
verified ·
1 Parent(s): 6dfac5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -42
app.py CHANGED
@@ -5,6 +5,8 @@ import docx
5
  import openpyxl
6
  import pptx
7
  import io
 
 
8
  from PIL import Image
9
  import gradio as gr
10
  from transformers import pipeline
@@ -18,10 +20,16 @@ app = FastAPI()
18
  # -------------------------
19
  # Extraction Functions
20
  # -------------------------
21
- """def extract_text_from_pdf(file_bytes):
22
  try:
23
- with fitz.open(stream=file_bytes, filetype="pdf") as doc:
24
- return "\n".join([page.get_text() for page in doc])
 
 
 
 
 
 
25
  except Exception as e:
26
  return f"❌ PDF extraction error: {e}"
27
 
@@ -46,7 +54,7 @@ def extract_text_from_pptx(file_bytes):
46
 
47
  def extract_text_from_xlsx(file_bytes):
48
  try:
49
- wb = openpyxl.load_workbook(io.BytesIO(file_bytes))
50
  text = []
51
  for sheet in wb.sheetnames:
52
  ws = wb[sheet]
@@ -56,44 +64,7 @@ def extract_text_from_xlsx(file_bytes):
56
  return "\n".join(text)
57
  except Exception as e:
58
  return f"❌ XLSX extraction error: {e}"
59
- """
60
- def extract_text_from_pdf(pdf_file):
61
- text = []
62
- try:
63
- with fitz.open(pdf_file) as doc:
64
- for page in doc:
65
- text.append(page.get_text("text"))
66
- except Exception as e:
67
- return f"Error reading PDF: {e}"
68
- return "\n".join(text)
69
-
70
- def extract_text_from_docx(docx_file):
71
- doc = docx.Document(docx_file)
72
- return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
73
-
74
- def extract_text_from_pptx(pptx_file):
75
- text = []
76
- try:
77
- presentation = pptx.Presentation(pptx_file)
78
- for slide in presentation.slides:
79
- for shape in slide.shapes:
80
- if hasattr(shape, "text"):
81
- text.append(shape.text)
82
- except Exception as e:
83
- return f"Error reading PPTX: {e}"
84
- return "\n".join(text)
85
 
86
- def extract_text_from_xlsx(xlsx_file):
87
- text = []
88
- try:
89
- wb = openpyxl.load_workbook(xlsx_file)
90
- for sheet in wb.sheetnames:
91
- ws = wb[sheet]
92
- for row in ws.iter_rows(values_only=True):
93
- text.append(" ".join(str(cell) for cell in row if cell))
94
- except Exception as e:
95
- return f"Error reading XLSX: {e}"
96
- return "\n".join(text)
97
  # -------------------------
98
  # Main Logic
99
  # -------------------------
@@ -112,7 +83,7 @@ def summarize_document(file):
112
  else:
113
  return "❌ Unsupported file format."
114
 
115
- if not text.strip():
116
  return "❗ No extractable text found."
117
 
118
  try:
 
5
  import openpyxl
6
  import pptx
7
  import io
8
+ import os
9
+ import tempfile
10
  from PIL import Image
11
  import gradio as gr
12
  from transformers import pipeline
 
20
  # -------------------------
21
  # Extraction Functions
22
  # -------------------------
23
+ def extract_text_from_pdf(file_bytes):
24
  try:
25
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
26
+ tmp.write(file_bytes)
27
+ tmp_path = tmp.name
28
+
29
+ with fitz.open(tmp_path) as doc:
30
+ text = "\n".join(page.get_text() for page in doc)
31
+ os.unlink(tmp_path)
32
+ return text
33
  except Exception as e:
34
  return f"❌ PDF extraction error: {e}"
35
 
 
54
 
55
  def extract_text_from_xlsx(file_bytes):
56
  try:
57
+ wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
58
  text = []
59
  for sheet in wb.sheetnames:
60
  ws = wb[sheet]
 
64
  return "\n".join(text)
65
  except Exception as e:
66
  return f"❌ XLSX extraction error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
 
 
 
 
 
 
 
 
 
 
68
  # -------------------------
69
  # Main Logic
70
  # -------------------------
 
83
  else:
84
  return "❌ Unsupported file format."
85
 
86
+ if not text or not text.strip():
87
  return "❗ No extractable text found."
88
 
89
  try: