muryshev's picture
init
57cf043
raw
history blame
1.61 kB
import logging
import zipfile
from pathlib import Path
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DocxToXml:
    """Extract the raw ``word/document.xml`` part from a DOCX file.

    A DOCX file is a ZIP archive; this class opens it with ``zipfile`` and
    returns the text of the main document part without parsing the XML.
    """

    # Archive member read by extract_document_xml(). This is the conventional
    # location of the main document part; packages that store it elsewhere
    # are reported as "document.xml not found".
    DOCUMENT_XML_PATH = "word/document.xml"

    def __init__(self, docx_path: str):
        """
        Initialize the converter with path to DOCX file

        Args:
            docx_path (str): Path to the DOCX file (anything accepted by
                ``pathlib.Path`` works)

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``
        """
        self.docx_path = Path(docx_path)
        if not self.docx_path.exists():
            raise FileNotFoundError(f"File not found: {docx_path}")

    def extract_document_xml(self) -> str:
        """
        Extract document.xml content from the DOCX file

        Returns:
            str: Content of document.xml, decoded as UTF-8 with any leading
                byte-order mark removed

        Raises:
            ValueError: If document.xml is not found in the DOCX file
            Exception: If the archive cannot be opened, read or decoded; the
                underlying error is attached as ``__cause__``
        """
        try:
            # Hold the archive open only for the read itself.
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                raw_xml = docx_zip.read(self.DOCUMENT_XML_PATH)
            # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading
            # BOM, which would otherwise end up as "\ufeff" in front of the
            # XML declaration.
            return raw_xml.decode("utf-8-sig")
        except KeyError as err:
            # ZipFile.read() raises KeyError when the member is absent.
            raise ValueError("document.xml not found in the DOCX file") from err
        except Exception as err:
            # The generic Exception type is kept for backward compatibility
            # with existing callers; chaining preserves the root cause.
            raise Exception(f"Error extracting XML: {err}") from err

    @staticmethod
    def convert_file(docx_path: str) -> str:
        """
        Static method to quickly convert a DOCX file to XML

        Args:
            docx_path (str): Path to the DOCX file

        Returns:
            str: Content of document.xml file

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``
            ValueError: If document.xml is not found in the DOCX file
            Exception: If the archive cannot be opened, read or decoded
        """
        return DocxToXml(docx_path).extract_document_xml()