Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,6 +69,29 @@ def extract_text_from_docx(docx_data, clean=True):
|
|
| 69 |
text = clean_text(text)
|
| 70 |
return text, len(text)
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
def read_document(file, clean=True):
|
| 74 |
"""Reads content from various document formats."""
|
|
@@ -107,15 +130,7 @@ def read_document(file, clean=True):
|
|
| 107 |
|
| 108 |
elif file_extension == 'pptx':
|
| 109 |
try:
|
| 110 |
-
|
| 111 |
-
content = ''
|
| 112 |
-
for slide in presentation.slides:
|
| 113 |
-
for shape in slide.shapes:
|
| 114 |
-
if hasattr(shape, "text"):
|
| 115 |
-
content += shape.text + ' '
|
| 116 |
-
if clean:
|
| 117 |
-
content = clean_text(content)
|
| 118 |
-
return content, len(content)
|
| 119 |
except Exception as e:
|
| 120 |
return f"Error reading PPTX: {e}", 0
|
| 121 |
|
|
|
|
| 69 |
text = clean_text(text)
|
| 70 |
return text, len(text)
|
| 71 |
|
| 72 |
+
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract text from the slide notes and slides of a PPTX archive.

    The PPTX payload is opened as a zip archive. Every member whose name
    matches the notes-slide pattern is converted first, then every member
    matching the slide pattern, each in the order the archive lists them.

    Args:
        pptx_data: Raw bytes of the .pptx file.
        clean: When true, the combined text is passed through ``clean_text``
            before being returned.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a valid zip archive.
    """
    notes_xmls = 'ppt/notesSlides/notesSlide[0-9]*.xml'
    slide_xmls = 'ppt/slides/slide[0-9]*.xml'

    parts = []
    # Read from the function's own argument; the earlier version referenced
    # an undefined ``docx_data`` name and failed with NameError on every call.
    # The context manager also guarantees the archive is closed if a member
    # read or conversion raises.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()

        # Notes come first, then slide content, matching the original
        # output order.
        for pattern in (notes_xmls, slide_xmls):
            for fname in filelist:
                if re.match(pattern, fname):
                    # xml2text is defined elsewhere in this file; presumably
                    # it returns the plain text of an XML part.
                    parts.append(xml2text(zipf.read(fname)))

    text = ''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
|
| 95 |
|
| 96 |
def read_document(file, clean=True):
|
| 97 |
"""Reads content from various document formats."""
|
|
|
|
| 130 |
|
| 131 |
elif file_extension == 'pptx':
|
| 132 |
try:
|
| 133 |
+
return extract_text_from_pptx(file_content, clean)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
except Exception as e:
|
| 135 |
return f"Error reading PPTX: {e}", 0
|
| 136 |
|