# Spaces: muryshev / generic-chatbot-backend
# Runtime error
# File size: 6,401 Bytes
# 57cf043

import logging
import os
from pathlib import Path

from bs4 import BeautifulSoup
from tqdm import tqdm

from components.parser.docx_to_xml import DocxToXml
from components.parser.xml.structures import ParsedXML, ParsedXMLs
from components.parser.xml.xml_info_parser import XMLInfoParser
from components.parser.xml.xml_table_parser import XMLTableParser
from components.parser.xml.xml_text_parser import XMLTextParser

logger = logging.getLogger(__name__)


class XMLParser:
    """
    Класс для парсинга xml файлов.
    """

    @classmethod
    def parse_all(
        cls,
        filepath: os.PathLike,
        encoding: str = 'cp866',
        include_content: bool = False,
        ignore_files_contains: list[str] = [],
        use_tqdm: bool = False,
    ) -> ParsedXMLs:
        """
        Парсинг всех xml файлов в директории.

        Args:
            filepath: os.PathLike - путь к директории с xml файлами
            encoding: str - кодировка файлов
            include_content: bool - включать ли содержимое файлов в результат
            ignore_files_contains: list[str] - игнорировать файлы, содержащие эти строки в названии
            use_tqdm: bool - использовать ли прогресс-бар

        Returns:
            ParsedXMLs - данные, полученные из всех xml файлов
        """
        files = cls._get_recursive_files(filepath, ignore_files_contains)
        logger.info(f"Found {len(files)} files to parse")
        if use_tqdm:
            files = tqdm(files, desc='Парсинг файлов')

        parsed_xmls = [cls.parse(file, encoding, include_content) for file in files]
        logger.info(f"Parsed {len(parsed_xmls)} files")
        parsed_xmls = [
            xml
            for xml in parsed_xmls
            if (
                xml is not None
                and not any(
                    ignore_file in xml.name for ignore_file in ignore_files_contains
                )
            )
        ]
        logger.info(f"Filtered {len(parsed_xmls)} files")
        return ParsedXMLs(parsed_xmls)

    @classmethod
    def parse(
        cls,
        filepath: os.PathLike,
        encoding: str = 'utf-8',
        include_content: bool = False,
    ) -> ParsedXML | None:
        """
        Парсинг xml файла.

        Args:
            filepath: os.PathLike - путь к xml файлу
            encoding: str - кодировка файла
            include_content: bool - включать ли содержимое файла в результат

        Returns:
            ParsedXML - данные, полученные из xml файла
        """
        if filepath.suffix in ['.docx', '.DOCX']:
            logger.info(f"Parsing docx file {filepath}")
            try:
                xml_text = DocxToXml(filepath).extract_document_xml()
                logger.info(f"Parsed docx file {filepath}")
            except Exception as e:
                logger.error(f"Error parsing docx file {filepath}: {e}")
                return None
        else:
            with open(filepath, 'r', encoding=encoding) as file:
                xml_text = file.read()
                
        soup = BeautifulSoup(xml_text, features='xml')

        # Создаем парсер информации и получаем базовые данные
        info_parser = XMLInfoParser(soup, filepath)
        parsed_xml = info_parser.parse()
        logger.debug(f"Parsed info for {filepath}")
        
        if not parsed_xml:
            logger.warning(f"Failed to parse info for {filepath}")
            return None

        if not include_content:
            logger.debug(f"Skipping content for {filepath}")
            return parsed_xml

        # Парсим таблицы и текст, сохраняя структурированные данные
        table_parser = XMLTableParser(soup)
        text_parser = XMLTextParser(soup)
        
        # Сохраняем структурированные данные вместо текста
        parsed_xml.tables = table_parser.parse()
        logger.debug(f"Parsed table content for {filepath}")
        
        parsed_xml.text = text_parser.parse()
        logger.debug(f"Parsed text content for {filepath}")
        
        # Собираем аббревиатуры из таблиц и текста
        abbreviations = []
        
        # Получаем аббревиатуры из таблиц
        table_abbreviations = table_parser.get_abbreviations()
        if table_abbreviations:
            logger.debug(f"Got {len(table_abbreviations)} abbreviations from tables")
            abbreviations.extend(table_abbreviations)
            
        # Получаем аббревиатуры из текста
        text_abbreviations = text_parser.get_abbreviations()
        if text_abbreviations:
            logger.debug(f"Got {len(text_abbreviations)} abbreviations from text")
            abbreviations.extend(text_abbreviations)
            
        # Сохраняем все аббревиатуры в ParsedXML
        if abbreviations:
            logger.info(f"Total abbreviations extracted: {len(abbreviations)}")
            parsed_xml.abbreviations = abbreviations
            
            # Применяем аббревиатуры к содержимому документа
            parsed_xml.apply_document_abbreviations()
            logger.debug(f"Applied abbreviations to document content")
        
        return parsed_xml

    @classmethod
    def _get_recursive_files(
        cls,
        path_to_dir: os.PathLike,
        ignore_files_contains: list[str] = [],
    ) -> list[os.PathLike]:
        """
        Получение всех xml файлов в директории любой вложенности.
        """
        path_to_dir = Path(path_to_dir)
        relative_paths = [
            path.relative_to(path_to_dir)
            for path in path_to_dir.glob('**/*.xml')
            if not any(
                ignore_file in path.name for ignore_file in ignore_files_contains
            )
        ]
        return [Path(path_to_dir) / path for path in relative_paths]