Spaces:
Sleeping
Sleeping
import logging
import zipfile
from pathlib import Path

# Module-level logger, named after this module so handlers/levels can be
# configured per module by the application.
logger = logging.getLogger(__name__)
class DocxToXml:
    """Read the ``word/document.xml`` part out of a DOCX (ZIP) archive."""

    def __init__(self, docx_path: str):
        """
        Initialize the converter with the path to a DOCX file.

        Args:
            docx_path (str): Path to the DOCX file.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
        """
        self.docx_path = Path(docx_path)
        if not self.docx_path.exists():
            raise FileNotFoundError(f"File not found: {docx_path}")

    def extract_document_xml(self) -> str:
        """
        Extract the document.xml content from the DOCX file.

        Returns:
            str: Content of ``word/document.xml`` decoded as UTF-8.

        Raises:
            ValueError: If ``word/document.xml`` is not found in the archive.
            Exception: For any other failure while opening, reading or
                decoding the archive; the original error is attached as
                ``__cause__``.
        """
        try:
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                # This class reads the main document body from the fixed
                # member name word/document.xml.
                xml_content = docx_zip.read('word/document.xml')
                return xml_content.decode('utf-8')
        except KeyError as err:
            # ZipFile.read raises KeyError when the member is missing.
            raise ValueError("document.xml not found in the DOCX file") from err
        except Exception as err:
            # Keep the historical broad Exception type for callers, but chain
            # the underlying error so the real cause is not lost.
            raise Exception(f"Error extracting XML: {str(err)}") from err

    @staticmethod
    def convert_file(docx_path: str) -> str:
        """
        Static method to quickly convert a DOCX file to XML.

        Args:
            docx_path (str): Path to the DOCX file.

        Returns:
            str: Content of the document.xml file.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
            ValueError: If ``word/document.xml`` is not found in the archive.
        """
        converter = DocxToXml(docx_path)
        return converter.extract_document_xml()