anuragsingh922
/

VocRT

Model card Files Files and versions Community

VocRT / providers /ppt_and_docx_helper.py

Anurag

version-2 initial version

5306da4 7 days ago

history blame contribute delete

620 Bytes

	from io import BytesIO
	from docx import Document as DocxDocument
	from pptx import Presentation



	def extract_text_from_docx(content_bytes: bytes) -> str:
	    """Return the paragraph text of a .docx file given as raw bytes.

	    The bytes are wrapped in an in-memory buffer and opened with
	    python-docx. Paragraphs whose text is empty or whitespace-only are
	    skipped; the remaining paragraph texts are joined with newlines.

	    NOTE(review): only ``document.paragraphs`` is read here, so text held
	    in tables is presumably not included -- confirm against callers.
	    """
	    document = DocxDocument(BytesIO(content_bytes))

	    kept_lines = []
	    for paragraph in document.paragraphs:
	        content = paragraph.text
	        if not content.strip():
	            # Blank paragraphs would only add empty lines to the output.
	            continue
	        kept_lines.append(content)

	    return "\n".join(kept_lines)



	def _iter_shape_texts(shapes):
	    """Yield each non-blank text in *shapes*, descending into tables and groups."""
	    for shape in shapes:
	        if hasattr(shape, "text"):
	            # Text-bearing shapes: checked first so their handling is
	            # exactly what it was before tables/groups were supported.
	            if shape.text.strip():
	                yield shape.text
	        elif getattr(shape, "has_table", False):
	            # Table graphic frames have no ``text``; read cell by cell.
	            for row in shape.table.rows:
	                for cell in row.cells:
	                    if cell.text.strip():
	                        yield cell.text
	        elif hasattr(shape, "shapes"):
	            # Group shapes carry no text themselves, only member shapes.
	            yield from _iter_shape_texts(shape.shapes)


	def extract_text_from_pptx(content_bytes: bytes) -> str:
	    """Return the text of a .pptx file given as raw bytes.

	    The bytes are wrapped in an in-memory buffer and opened with
	    python-pptx. Slides are visited in order and, within each slide,
	    shapes in the order the library yields them. Text is collected from
	    shapes exposing a ``text`` attribute, from table cells, and from
	    shapes nested inside group shapes (previously the latter two were
	    silently skipped). Blank or whitespace-only texts are dropped.

	    Args:
	        content_bytes: Raw contents of a .pptx file.

	    Returns:
	        The collected texts joined with newlines; an empty string when
	        no text is found.
	    """
	    prs = Presentation(BytesIO(content_bytes))
	    texts = []
	    for slide in prs.slides:
	        texts.extend(_iter_shape_texts(slide.shapes))
	    return "\n".join(texts)