# File size: 7,219 Bytes

# 57cf043

import logging
import os
import re
import subprocess
from pathlib import Path

from docx import Document
from docx.oxml import parse_xml
from docx.oxml.ns import qn
from fastapi import HTTPException

# Literal XML fragments, escaped so they can be passed to ``re.search`` as-is.
# FileService._prettify_xml cuts the file content from the start of the first
# START_SPECIALS match to just past the end of the first END_SPECIALS match.
START_SPECIALS = re.escape(
    r'<w:t>$$$$$SVB396</w:t></w:r></w:p>'
    r'<w:p><w:pPr><w:ind w:first-line="0"/><w:jc w:val="left"/></w:pPr><w:r>'
)
END_SPECIALS = re.escape(
    r'<w:t>$$18 Текст</w:t></w:r></w:p>'
    r'<w:p><w:pPr><w:ind w:first-line="0"/></w:pPr>'
)


class FileService:
    def __init__(self):
        self.documents_path = Path(os.environ.get('DOCUMENTS_PATH', '/app/data/xmls_processed'))
        self.source_path = Path(os.environ.get('SOURCE_PATH', '/app/data/SEND'))

    def prepare_file(self, filename: str) -> Path:
        """
        Получает содержимое xml-файла. 
        Если он не обработан, файл берётся из директории SOURCE_PATH, обрабатывается и сохраняется в директорию DOCUMENTS_PATH.
        
        Args:
            filename (str): Имя файла
        
        Returns:
            Path: Путь к файлу
        """
        file_path = self.documents_path / filename
        
        if not file_path.exists():
            source_file_path = self.source_path / filename
            
            logging.info(f"Process file: {source_file_path}")
            
            if (not source_file_path.exists()) or (not source_file_path.is_file()):
                logging.error(f"File not found: {source_file_path}")
                logging.error(f"Directory: {self.source_path} exists: {self.source_path.exists()}")
                raise HTTPException(status_code=404, detail="File not found")
                        
            with open(source_file_path, "r", encoding="utf-8") as source_file:
                file_content = source_file.read()
            
            file_content = self._prettify_xml(file_content)
            
            file_path.parent.mkdir(parents=True, exist_ok=True)
            
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(file_content)
                
            logging.info(f"File saved: {file_path}")
                
        return file_path
    
    def prepare_pdf(self, filename: str) -> Path:
        """
        Получает содержимое docx-файла.
        """
        prepared_file = self.prepare_file(filename)
        docx_path = prepared_file.with_suffix('.docx')
        pdf_path = prepared_file.with_suffix('.pdf')
        if not pdf_path.exists():
            if self._convert_to_docx(prepared_file) != 0:
                raise HTTPException(status_code=400, detail="Failed to convert xml to docx")
            
            self._fix_style_table_docx(docx_path)
            
            if self._convert_to_pdf(docx_path) != 0:
                raise HTTPException(status_code=400, detail="Failed to convert docx to pdf")
            
            docx_path.unlink()
            
        return pdf_path

    @staticmethod
    def _prettify_xml(file_content: str) -> str:
        """
        Cut the leading special markup out of the xml so the document looks clean.

        The removed span starts at the first START_SPECIALS match and ends one
        character past the first END_SPECIALS match. When either marker is
        missing, the content is returned unchanged.

        Args:
            file_content (str): Content of the xml file.

        Returns:
            str: Content of the xml file without the special markup.
        """
        head = re.search(START_SPECIALS, file_content)
        tail = re.search(END_SPECIALS, file_content)

        if head is None or tail is None:
            return file_content

        cut_from = head.start()
        # NOTE(review): the extra +1 also drops the first character that
        # follows the end marker. Presumably this is what leaves the literal
        # text 'w:r>' that _fix_style_table_docx strips from paragraphs later
        # - confirm before "fixing" this offset.
        resume_at = tail.end() + 1

        return file_content[:cut_from] + file_content[resume_at:]

    def _fix_style_table_docx(self, file_path: Path) -> None:
        """
        Rebuild the docx as plain paragraphs and grid tables, dropping special markup.

        A new document is created and the original is overwritten with it.
        Only body children whose tag ends with ``p`` or ``tbl`` are carried
        over; everything else in the body is left out.

        * Paragraphs: the text is copied with ``{...}`` groups, the literal
          ``w:r>`` and the "См. документ в  MS-Word" note removed.
        * Tables: a ``TableGrid`` table with one row per ``w:tr`` and as many
          columns as the widest row is created; each cell receives the
          concatenated ``w:t`` texts of the source cell, skipping every text
          fragment that contains ``{``.

        Args:
            file_path (Path): Path to the docx file; it is modified in place.
        """
        source_doc = Document(str(file_path))
        output_doc = Document()

        for block in source_doc.element.body:
            if block.tag.endswith('p'):
                # assumes the paragraph element exposes its whole text through
                # ``.text`` - TODO confirm for the installed python-docx version.
                # 'w:r>' is presumably residue left by _prettify_xml's cut.
                clear_text = self._remove_curly_braces_content(block.text).replace('w:r>', '')
                clear_text = clear_text.replace('См. документ в  MS-Word', '')
                output_doc.add_paragraph(clear_text)
                continue

            if not block.tag.endswith('tbl'):
                continue

            old_table = parse_xml(block.xml)
            old_rows = old_table.findall(qn('w:tr'))
            if not old_rows:
                # Nothing to copy; max() below would raise on an empty table.
                continue

            cells_per_row = [old_row.findall(qn('w:tc')) for old_row in old_rows]
            column_count = max(len(old_cells) for old_cells in cells_per_row)

            table = output_doc.add_table(rows=len(old_rows), cols=column_count)
            table.style = 'TableGrid'
            table.autofit = False

            for new_row, old_cells in zip(table.rows, cells_per_row):
                # Resolve the row's cells once instead of once per text fragment.
                new_cells = new_row.cells
                for cell_index, old_cell in enumerate(old_cells):
                    # An empty <w:t/> has ``.text`` of None; treat it as ''.
                    fragments = [node.text or '' for node in old_cell.findall(f".//{qn('w:t')}")]
                    kept = [fragment for fragment in fragments if '{' not in fragment]
                    if not kept:
                        continue
                    try:
                        new_cell = new_cells[cell_index]
                    except IndexError:
                        logging.warning('Ошибка в индексе, таблица не правильной формы')
                        continue
                    new_cell.text = ''.join(kept)

        output_doc.save(str(file_path))
        
    @staticmethod
    def _remove_curly_braces_content(text: str) -> str:
        """Удаляет все содержимое внутри фигурных скобок, включая сами скобки."""
        return re.sub(r'\{[^{}]*\}', '', text)
    
    def _convert_to_pdf(self, file_path: Path) -> int:
        """
        Convert a docx file to pdf by running headless LibreOffice.

        The output directory passed to LibreOffice is the parent directory of
        the input file.

        Args:
            file_path (Path): Path to the docx file.

        Returns:
            int: Exit code of the process; 0 means the conversion succeeded.
        """
        completed = subprocess.run([
            'libreoffice',
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', str(file_path.parent),
            str(file_path),
        ])
        return completed.returncode
        
        
    def _convert_to_docx(self, file_path: Path) -> int:
        """
        Convert an xml file to docx by running headless LibreOffice.

        The output directory passed to LibreOffice is the parent directory of
        the input file.

        Args:
            file_path (Path): Path to the xml file.

        Returns:
            int: Exit code of the process; 0 means the conversion succeeded.
        """
        completed = subprocess.run([
            'libreoffice',
            '--headless',
            '--convert-to', 'docx',
            '--outdir', str(file_path.parent),
            str(file_path),
        ])
        return completed.returncode