KingNish committed on
Commit
9686871
·
verified ·
1 Parent(s): a007d1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -9
app.py CHANGED
@@ -69,6 +69,29 @@ def extract_text_from_docx(docx_data, clean=True):
69
  text = clean_text(text)
70
  return text, len(text)
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  def read_document(file, clean=True):
74
  """Reads content from various document formats."""
@@ -107,15 +130,7 @@ def read_document(file, clean=True):
107
 
108
  elif file_extension == 'pptx':
109
  try:
110
- presentation = Presentation(io.BytesIO(file_content))
111
- content = ''
112
- for slide in presentation.slides:
113
- for shape in slide.shapes:
114
- if hasattr(shape, "text"):
115
- content += shape.text + ' '
116
- if clean:
117
- content = clean_text(content)
118
- return content, len(content)
119
  except Exception as e:
120
  return f"Error reading PPTX: {e}", 0
121
 
 
69
  text = clean_text(text)
70
  return text, len(text)
71
 
72
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract text from the slide notes and slides of a PPTX archive.

    The PPTX payload is opened as a ZIP archive. Every member whose name
    matches ``ppt/notesSlides/notesSlide<digits>.xml`` is converted with
    ``xml2text`` first, followed by every member matching
    ``ppt/slides/slide<digits>.xml``; members are visited in the order the
    archive lists them.

    Args:
        pptx_data: Raw bytes of the .pptx file.
        clean: When true, the combined text is passed through ``clean_text``
            before being returned.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a valid ZIP archive.
    """
    # Raw strings with an escaped dot so only real ``.xml`` members match.
    notes_pattern = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_pattern = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    parts = []
    # The original opened ``docx_data`` (an undefined name here), which made
    # every call fail with NameError; the archive must come from pptx_data.
    # The context manager closes the archive even if xml2text raises.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()

        # Slide notes first, then slide content (shapes and text boxes),
        # preserving the original output order.
        for pattern in (notes_pattern, slide_pattern):
            for fname in filelist:
                if pattern.fullmatch(fname):
                    parts.append(xml2text(zipf.read(fname)))

    text = ''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
95
 
96
  def read_document(file, clean=True):
97
  """Reads content from various document formats."""
 
130
 
131
  elif file_extension == 'pptx':
132
  try:
133
+ return extract_text_from_pptx(file_content, clean)
 
 
 
 
 
 
 
 
134
  except Exception as e:
135
  return f"Error reading PPTX: {e}", 0
136