Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

KingNish committed on Sep 19, 2024

Commit

9686871

verified ·

1 Parent(s): a007d1e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -69,6 +69,29 @@ def extract_text_from_docx(docx_data, clean=True):
         text = clean_text(text)
     return text, len(text)
 def read_document(file, clean=True):
     """Reads content from various document formats."""
@@ -107,15 +130,7 @@ def read_document(file, clean=True):
     elif file_extension == 'pptx':
         try:
-            presentation = Presentation(io.BytesIO(file_content))
-            content = ''
-            for slide in presentation.slides:
-                for shape in slide.shapes:
-                    if hasattr(shape, "text"):
-                        content += shape.text + ' '
-            if clean:
-                content = clean_text(content)
-            return content, len(content)
         except Exception as e:
             return f"Error reading PPTX: {e}", 0

         text = clean_text(text)
     return text, len(text)
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract the text of a PPTX archive held in memory.

    Speaker-notes parts (``ppt/notesSlides/notesSlideN.xml``) are read first,
    then slide parts (``ppt/slides/slideN.xml``); within each group the
    archive's own member order is kept.

    Args:
        pptx_data: Raw bytes of the .pptx file (a ZIP container).
        clean: When true, pass the combined text through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a valid ZIP archive.
    """
    # Raw strings with an escaped dot; fullmatch() below anchors both ends so
    # look-alike members such as "slide1.xml.rels" are never picked up.
    notes_part = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_part = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    parts = []
    # The context manager closes the archive even if reading or converting a
    # member raises part-way through.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        members = zipf.namelist()
        # Notes first, then slide content (shapes and text boxes).
        for pattern in (notes_part, slide_part):
            for fname in members:
                if pattern.fullmatch(fname):
                    parts.append(xml2text(zipf.read(fname)))

    text = ''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
 def read_document(file, clean=True):
     """Reads content from various document formats."""
     elif file_extension == 'pptx':
         try:
+            return extract_text_from_pptx(file_content, clean)
         except Exception as e:
             return f"Error reading PPTX: {e}", 0