from io import BytesIO | |
from docx import Document as DocxDocument | |
from pptx import Presentation | |
def extract_text_from_docx(content_bytes: bytes) -> str:
    """Return the non-blank paragraph text of a .docx payload as one string.

    Args:
        content_bytes: Raw bytes of the document. They are wrapped in an
            in-memory ``BytesIO`` stream and opened with ``DocxDocument``.

    Returns:
        The text of each entry in the document's ``paragraphs`` whose text
        is not empty or whitespace-only, joined with ``"\\n"``. Only
        ``paragraphs`` is read; nothing else in the document is visited.
    """
    document = DocxDocument(BytesIO(content_bytes))
    kept_lines = []
    for paragraph in document.paragraphs:
        paragraph_text = paragraph.text
        # Blank and whitespace-only paragraphs are dropped; kept ones are
        # stored unstripped, exactly as the paragraph reports them.
        if paragraph_text.strip():
            kept_lines.append(paragraph_text)
    return "\n".join(kept_lines)
def extract_text_from_pptx(content_bytes: bytes) -> str:
    """Return the non-blank shape text of a .pptx payload as one string.

    Args:
        content_bytes: Raw bytes of the presentation. They are wrapped in an
            in-memory ``BytesIO`` stream and opened with ``Presentation``.

    Returns:
        The ``text`` of every shape that exposes a ``text`` attribute and
        whose text is not empty or whitespace-only, in slide order then
        shape order, joined with ``"\\n"``. Shapes without a ``text``
        attribute are skipped.
    """
    presentation = Presentation(BytesIO(content_bytes))
    collected = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        # hasattr guards shapes that carry no text at all; the strip()
        # check then filters out shapes whose text is only whitespace.
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return "\n".join(collected)