Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -69,6 +69,29 @@ def extract_text_from_docx(docx_data, clean=True):
|
|
69 |
text = clean_text(text)
|
70 |
return text, len(text)
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
def read_document(file, clean=True):
|
74 |
"""Reads content from various document formats."""
|
@@ -107,15 +130,7 @@ def read_document(file, clean=True):
|
|
107 |
|
108 |
elif file_extension == 'pptx':
|
109 |
try:
|
110 |
-
|
111 |
-
content = ''
|
112 |
-
for slide in presentation.slides:
|
113 |
-
for shape in slide.shapes:
|
114 |
-
if hasattr(shape, "text"):
|
115 |
-
content += shape.text + ' '
|
116 |
-
if clean:
|
117 |
-
content = clean_text(content)
|
118 |
-
return content, len(content)
|
119 |
except Exception as e:
|
120 |
return f"Error reading PPTX: {e}", 0
|
121 |
|
|
|
69 |
text = clean_text(text)
|
70 |
return text, len(text)
|
71 |
|
72 |
+
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract the text of a PPTX presentation from its raw bytes.

    The data is opened as a ZIP archive. The slide-notes parts
    (``ppt/notesSlides/notesSlide<N>.xml``) are read first, then the slide
    parts (``ppt/slides/slide<N>.xml``), and the result of ``xml2text`` for
    each part is concatenated in archive order.

    Args:
        pptx_data: Bytes of the .pptx file.
        clean: When true, the combined text is passed through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a valid ZIP archive.
    """
    # Raw strings with an escaped dot, matched against the whole name, so
    # only the real slide / notes parts are picked up.
    notes_part = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_part = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    parts = []
    # The context manager closes the archive even if reading a part fails.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()
        # Notes first, then slide content, matching the original output order.
        for pattern in (notes_part, slide_part):
            for fname in filelist:
                if pattern.fullmatch(fname):
                    # NOTE(review): xml2text is defined elsewhere in the file;
                    # presumably it returns the part's text as str - confirm.
                    parts.append(xml2text(zipf.read(fname)))

    text = ''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
|
95 |
|
96 |
def read_document(file, clean=True):
|
97 |
"""Reads content from various document formats."""
|
|
|
130 |
|
131 |
elif file_extension == 'pptx':
|
132 |
try:
|
133 |
+
return extract_text_from_pptx(file_content, clean)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
except Exception as e:
|
135 |
return f"Error reading PPTX: {e}", 0
|
136 |
|