from io import BytesIO

from docx import Document as DocxDocument
from pptx import Presentation


def extract_text_from_docx(content_bytes: bytes) -> str:
    """Return the text of a .docx file as newline-separated paragraphs.

    Args:
        content_bytes: Raw bytes of the document; they are wrapped in an
            in-memory ``BytesIO`` and handed to ``DocxDocument``.

    Returns:
        The text of every entry in ``document.paragraphs`` joined with
        ``"\\n"``. Paragraphs whose text is empty or whitespace-only are
        skipped; kept paragraphs are emitted unmodified (not stripped).
        An empty string is returned when nothing is kept.
    """
    document = DocxDocument(BytesIO(content_bytes))

    kept_lines = []
    for paragraph in document.paragraphs:
        paragraph_text = paragraph.text
        # Filter on the stripped text, but keep the original text as-is.
        if not paragraph_text.strip():
            continue
        kept_lines.append(paragraph_text)

    return "\n".join(kept_lines)


def extract_text_from_pptx(content_bytes: bytes) -> str:
    """Return the text of a .pptx file as newline-separated shape texts.

    Args:
        content_bytes: Raw bytes of the presentation; they are wrapped in an
            in-memory ``BytesIO`` and handed to ``Presentation``.

    Returns:
        The ``text`` of each shape, in slide order and then shape order,
        joined with ``"\\n"``. Shapes that have no ``text`` attribute, or
        whose text is empty or whitespace-only, are skipped. An empty string
        is returned when nothing is kept.
    """
    presentation = Presentation(BytesIO(content_bytes))

    # Only direct members of ``slide.shapes`` are visited; a shape takes part
    # only if it exposes a ``text`` attribute with non-blank content.
    return "\n".join(
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    )