VocRT / providers /ppt_and_docx_helper.py
Anurag
version-2 initial version
5306da4
from io import BytesIO
from docx import Document as DocxDocument
from pptx import Presentation
def extract_text_from_docx(content_bytes: bytes) -> str:
    """Return the non-blank paragraph text of a .docx payload as one string.

    Args:
        content_bytes: Raw bytes of the document to parse.

    Returns:
        The text of every entry in the document's ``paragraphs`` collection
        whose text is not empty or whitespace-only, joined with newlines.
        An empty string when no paragraph qualifies.
    """
    # Wrap the raw bytes in an in-memory stream to hand to the parser.
    stream = BytesIO(content_bytes)
    document = DocxDocument(stream)

    # NOTE(review): only ``document.paragraphs`` is read; text stored
    # elsewhere (e.g. table cells) is presumably not included -- confirm
    # whether callers need it.
    kept = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            kept.append(text)
    return "\n".join(kept)
def _iter_shape_texts(shapes):
    """Yield non-blank text from *shapes*, descending into groups and tables."""
    for shape in shapes:
        # A group shape has no text of its own; its text lives in the child
        # shapes, which may themselves be groups, so recurse.
        if hasattr(shape, "shapes"):
            yield from _iter_shape_texts(shape.shapes)
            continue

        # A table is hosted in a graphic frame without a ``text`` attribute;
        # its text is stored per cell.
        if getattr(shape, "has_table", False):
            for row in shape.table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        yield cell.text
            continue

        # Shapes with no ``text`` attribute (pictures, connectors, ...) fall
        # back to "" and are skipped, as are shapes whose text is blank.
        text = getattr(shape, "text", "")
        if text.strip():
            yield text


def extract_text_from_pptx(content_bytes: bytes) -> str:
    """Return the text found on every slide of a .pptx payload as one string.

    Slides are visited in presentation order and shapes in the order the
    slide lists them. Besides shapes that expose ``text`` directly, text
    inside group shapes (at any nesting depth) and inside table cells is
    also collected, since those containers have no ``text`` attribute and
    would otherwise be silently skipped.

    Args:
        content_bytes: Raw bytes of the presentation to parse.

    Returns:
        All non-blank text fragments joined with newlines; an empty string
        when the presentation contains no text.
    """
    prs = Presentation(BytesIO(content_bytes))
    return "\n".join(
        text
        for slide in prs.slides
        for text in _iter_shape_texts(slide.shapes)
    )