import logging
import zipfile
from pathlib import Path

logger = logging.getLogger(__name__)


class DocxToXml:
    """Read the raw ``word/document.xml`` part out of a DOCX (ZIP) archive."""

    def __init__(self, docx_path: str):
        """
        Initialize the converter with the path to a DOCX file.

        Args:
            docx_path (str): Path to the DOCX file.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
        """
        self.docx_path = Path(docx_path)
        if not self.docx_path.exists():
            raise FileNotFoundError(f"File not found: {docx_path}")

    def extract_document_xml(self) -> str:
        """
        Extract the ``word/document.xml`` content from the DOCX file.

        Returns:
            str: Content of ``word/document.xml`` decoded as UTF-8.

        Raises:
            ValueError: If ``word/document.xml`` is not present in the archive.
            Exception: If the archive cannot be opened or read, or the member
                cannot be decoded as UTF-8. The underlying error is attached
                as ``__cause__``.
        """
        try:
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                # This converter reads the main document part from its
                # conventional location inside the archive.
                xml_content = docx_zip.read('word/document.xml')
                return xml_content.decode('utf-8')
        except KeyError as err:
            # ZipFile.read raises KeyError for a missing member; chain it so
            # the traceback shows which archive member lookup failed.
            raise ValueError("document.xml not found in the DOCX file") from err
        except Exception as err:
            # Keep the generic Exception type and message callers already
            # rely on, but preserve the original error as the explicit cause.
            raise Exception(f"Error extracting XML: {err}") from err

    @staticmethod
    def convert_file(docx_path: str) -> str:
        """
        Convert a DOCX file to its ``document.xml`` text in one call.

        Args:
            docx_path (str): Path to the DOCX file.

        Returns:
            str: Content of ``word/document.xml``.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
            ValueError: If ``word/document.xml`` is not present in the archive.
            Exception: If the archive cannot be read or decoded.
        """
        converter = DocxToXml(docx_path)
        return converter.extract_document_xml()