Spaces:
Runtime error
Runtime error
| import logging | |
| import os | |
| import re | |
| import subprocess | |
| from pathlib import Path | |
| from docx import Document | |
| from docx.oxml import parse_xml | |
| from docx.oxml.ns import qn | |
| from fastapi import HTTPException | |
# Regex-escaped literal XML fragments. FileService._prettify_xml searches for
# both and cuts out the text that starts at the first marker and ends just
# past the second one.
START_SPECIALS = re.escape(
    r'<w:t>$$$$$SVB396</w:t></w:r></w:p>'
    r'<w:p><w:pPr><w:ind w:first-line="0"/><w:jc w:val="left"/></w:pPr><w:r>'
)
END_SPECIALS = re.escape(
    r'<w:t>$$18 Текст</w:t></w:r></w:p>'
    r'<w:p><w:pPr><w:ind w:first-line="0"/></w:pPr>'
)
class FileService:
    """Prepare cleaned XML documents and render them to PDF with LibreOffice.

    Raw files are read from ``SOURCE_PATH``; cleaned copies and the generated
    PDFs are stored under ``DOCUMENTS_PATH``. Both locations come from the
    environment variables of the same name, with built-in defaults.
    """

    def __init__(self):
        self.documents_path = Path(os.environ.get('DOCUMENTS_PATH', '/app/data/xmls_processed'))
        self.source_path = Path(os.environ.get('SOURCE_PATH', '/app/data/SEND'))

    def prepare_file(self, filename: str) -> Path:
        """Return the path of the processed XML file, creating it if needed.

        If the file is not yet present in ``documents_path`` it is read from
        ``source_path``, passed through ``_prettify_xml`` and written to
        ``documents_path``.

        Args:
            filename: File name, relative to the source/documents directories.

        Returns:
            Path to the processed file inside ``documents_path``.

        Raises:
            HTTPException: 404 when the source file does not exist or is not
                a regular file.
        """
        # NOTE(review): ``filename`` is joined to the base directories without
        # validation; if it comes from a request, confirm the caller rejects
        # values such as "../x" before they reach this method.
        file_path = self.documents_path / filename
        if file_path.exists():
            return file_path

        source_file_path = self.source_path / filename
        logging.info("Process file: %s", source_file_path)
        # is_file() is already False for a missing path, so one check suffices.
        if not source_file_path.is_file():
            logging.error("File not found: %s", source_file_path)
            logging.error(
                "Directory: %s exists: %s",
                self.source_path,
                self.source_path.exists(),
            )
            raise HTTPException(status_code=404, detail="File not found")

        with open(source_file_path, "r", encoding="utf-8") as source_file:
            file_content = self._prettify_xml(source_file.read())

        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(file_content)
        logging.info("File saved: %s", file_path)
        return file_path

    def prepare_pdf(self, filename: str) -> Path:
        """Return the path of the PDF rendering of ``filename``.

        The PDF is produced on first request: processed XML -> docx
        (LibreOffice) -> table/text clean-up -> pdf (LibreOffice). An already
        existing PDF is returned as is.

        Args:
            filename: Name of the XML file.

        Returns:
            Path to the PDF file next to the processed XML file.

        Raises:
            HTTPException: 404 from ``prepare_file``; 400 when either
                conversion step fails.
        """
        prepared_file = self.prepare_file(filename)
        pdf_path = prepared_file.with_suffix('.pdf')
        if pdf_path.exists():
            return pdf_path

        docx_path = prepared_file.with_suffix('.docx')
        try:
            # The exit code alone is not taken as proof that the converter
            # actually wrote its output, so the file is checked as well.
            if self._convert_to_docx(prepared_file) != 0 or not docx_path.is_file():
                raise HTTPException(status_code=400, detail="Failed to convert xml to docx")
            self._fix_style_table_docx(docx_path)
            if self._convert_to_pdf(docx_path) != 0 or not pdf_path.is_file():
                raise HTTPException(status_code=400, detail="Failed to convert docx to pdf")
        finally:
            # The docx is only an intermediate artefact; remove it on failure
            # too so that aborted runs do not leave files behind.
            if docx_path.exists():
                docx_path.unlink()
        return pdf_path

    @staticmethod
    def _prettify_xml(file_content: str) -> str:
        """Cut the service header delimited by the two marker fragments.

        Args:
            file_content: Content of the XML file.

        Returns:
            The content without the span from ``START_SPECIALS`` to
            ``END_SPECIALS``; the unchanged content when either marker is
            missing.
        """
        start = re.search(START_SPECIALS, file_content)
        if start is None:
            return file_content
        # Only an end marker located after the start marker delimits a header.
        end = re.compile(END_SPECIALS).search(file_content, start.end())
        if end is None:
            return file_content
        # NOTE(review): "+ 1" also drops the first character after the end
        # marker. Kept unchanged because _fix_style_table_docx strips a
        # leftover 'w:r>' fragment, presumably the rest of that tag - confirm
        # before changing either side.
        return file_content[:start.start()] + file_content[end.end() + 1:]

    def _fix_style_table_docx(self, file_path: Path) -> None:
        """Rebuild the docx with plain paragraphs and grid tables, in place.

        Every body paragraph is copied as text with ``{...}`` groups and
        service fragments removed; every table is recreated as a text-only
        grid. Other body elements are not copied. The result overwrites
        ``file_path``.

        Args:
            file_path: Path to the docx file.
        """
        source_doc = Document(str(file_path))
        output_doc = Document()
        paragraph_tag = qn('w:p')
        table_tag = qn('w:tbl')

        for block in source_doc.element.body:
            if block.tag == paragraph_tag:
                # ``or ''`` guards against a paragraph element whose ``text``
                # is None, which re.sub would reject.
                clear_text = self._remove_curly_braces_content(block.text or '')
                clear_text = clear_text.replace('w:r>', '')
                clear_text = clear_text.replace('См. документ в MS-Word', '')
                output_doc.add_paragraph(clear_text)
            elif block.tag == table_tag:
                self._copy_table(block, output_doc)

        output_doc.save(str(file_path))

    @staticmethod
    def _copy_table(block, output_doc) -> None:
        """Append a text-only copy of the ``w:tbl`` element ``block`` to ``output_doc``."""
        cell_tag = qn('w:tc')
        text_path = './/' + qn('w:t')

        cells_per_row = [row.findall(cell_tag) for row in block.findall(qn('w:tr'))]
        column_count = max((len(cells) for cells in cells_per_row), default=0)
        if column_count == 0:
            # No rows or no cells: there is nothing to copy.
            return

        table = output_doc.add_table(rows=len(cells_per_row), cols=column_count)
        table.style = 'TableGrid'
        table.autofit = False

        for row_index, old_cells in enumerate(cells_per_row):
            new_cells = table.rows[row_index].cells  # looked up once per row
            for cell_index, old_cell in enumerate(old_cells):
                # Skip empty text nodes and fragments carrying a '{' marker.
                fragments = [
                    node.text
                    for node in old_cell.findall(text_path)
                    if node.text and '{' not in node.text
                ]
                if not fragments:
                    continue
                try:
                    new_cells[cell_index].text = ''.join(fragments)
                except IndexError:
                    logging.warning('Ошибка в индексе, таблица не правильной формы')

    @staticmethod
    def _remove_curly_braces_content(text: str) -> str:
        """Remove every innermost ``{...}`` group from ``text``, braces included."""
        return re.sub(r'\{[^{}]*\}', '', text)

    @staticmethod
    def _run_libreoffice_convert(file_path: Path, target_format: str) -> int:
        """Run headless LibreOffice to convert ``file_path`` to ``target_format``.

        The output is written next to the input file.

        Returns:
            Exit code of the ``libreoffice`` process.
        """
        command = [
            'libreoffice', '--headless',
            '--convert-to', target_format,
            '--outdir', str(file_path.parent),
            str(file_path),
        ]
        return subprocess.run(command).returncode

    def _convert_to_pdf(
        self,
        file_path: Path,
    ) -> int:
        """Convert a docx file to pdf.

        Returns:
            Exit code; 0 when the conversion succeeded.
        """
        return self._run_libreoffice_convert(file_path, 'pdf')

    def _convert_to_docx(
        self,
        file_path: Path,
    ) -> int:
        """Convert an xml file to docx.

        Returns:
            Exit code; 0 when the conversion succeeded.
        """
        return self._run_libreoffice_convert(file_path, 'docx')