Spaces:

muryshev
/

generic-chatbot-backend

Sleeping

App Files Files Community

generic-chatbot-backend / components /parser /xml /xml_parser.py

muryshev

init

57cf043 3 months ago

raw

history blame

6.4 kB

	import logging
	import os
	from pathlib import Path

	from bs4 import BeautifulSoup
	from tqdm import tqdm

	from components.parser.docx_to_xml import DocxToXml
	from components.parser.xml.structures import ParsedXML, ParsedXMLs
	from components.parser.xml.xml_info_parser import XMLInfoParser
	from components.parser.xml.xml_table_parser import XMLTableParser
	from components.parser.xml.xml_text_parser import XMLTextParser

	# Module-level logger; XMLParser reports parsing progress and failures through it.
	logger = logging.getLogger(__name__)


	class XMLParser:
	    """
	    Parser for XML documents (DOCX files are converted to XML first).
	    """

	    @classmethod
	    def parse_all(
	        cls,
	        filepath: os.PathLike,
	        encoding: str = 'cp866',
	        include_content: bool = False,
	        ignore_files_contains: list[str] | None = None,
	        use_tqdm: bool = False,
	    ) -> ParsedXMLs:
	        """
	        Parse every XML file found under a directory.

	        Args:
	            filepath: Directory that is searched recursively for ``*.xml`` files.
	            encoding: Encoding used to read each file. Note that the default
	                here is 'cp866' while ``parse`` defaults to 'utf-8'.
	            include_content: Whether to parse tables, text and abbreviations
	                in addition to the basic document info.
	            ignore_files_contains: Substrings; a file is skipped when its file
	                name contains any of them, and a parsed document is dropped
	                when its ``name`` contains any of them. ``None`` (the default)
	                disables the filtering.
	            use_tqdm: Whether to show a progress bar while parsing.

	        Returns:
	            ParsedXMLs built from the documents that were parsed successfully.
	        """
	        # A None default avoids sharing one mutable list between calls.
	        ignored = list(ignore_files_contains or ())

	        files = cls._get_recursive_files(filepath, ignored)
	        logger.info(f"Found {len(files)} files to parse")
	        if use_tqdm:
	            files = tqdm(files, desc='Парсинг файлов')

	        parsed_xmls = [cls.parse(file, encoding, include_content) for file in files]
	        logger.info(f"Parsed {len(parsed_xmls)} files")

	        # parse() returns None for documents it could not handle. The ignore
	        # list is applied a second time because here it is matched against the
	        # parsed document's name, which may differ from the file name.
	        parsed_xmls = [
	            xml
	            for xml in parsed_xmls
	            if xml is not None
	            and not any(fragment in xml.name for fragment in ignored)
	        ]
	        logger.info(f"Filtered {len(parsed_xmls)} files")
	        return ParsedXMLs(parsed_xmls)

	    @classmethod
	    def parse(
	        cls,
	        filepath: os.PathLike,
	        encoding: str = 'utf-8',
	        include_content: bool = False,
	    ) -> ParsedXML | None:
	        """
	        Parse a single XML (or DOCX) file.

	        Args:
	            filepath: Path to the file. A ``.docx`` suffix (any letter case)
	                routes the file through DocxToXml; anything else is read as
	                text with ``encoding``.
	            encoding: Encoding used to read non-DOCX files.
	            include_content: Whether to parse tables, text and abbreviations
	                in addition to the basic document info.

	        Returns:
	            The ParsedXML produced by XMLInfoParser, or None when the DOCX
	            conversion raises or the info parser returns nothing.
	        """
	        # Accept plain strings too: ``.suffix`` only exists on Path objects.
	        filepath = Path(filepath)

	        if filepath.suffix.lower() == '.docx':
	            logger.info(f"Parsing docx file {filepath}")
	            try:
	                xml_text = DocxToXml(filepath).extract_document_xml()
	            except Exception as e:
	                # Best effort: a document that cannot be converted is skipped.
	                logger.error(f"Error parsing docx file {filepath}: {e}")
	                return None
	            logger.info(f"Parsed docx file {filepath}")
	        else:
	            with open(filepath, 'r', encoding=encoding) as file:
	                xml_text = file.read()

	        soup = BeautifulSoup(xml_text, features='xml')

	        # Build the info parser and extract the basic document data.
	        info_parser = XMLInfoParser(soup, filepath)
	        parsed_xml = info_parser.parse()
	        logger.debug(f"Parsed info for {filepath}")

	        if not parsed_xml:
	            logger.warning(f"Failed to parse info for {filepath}")
	            return None

	        if not include_content:
	            logger.debug(f"Skipping content for {filepath}")
	            return parsed_xml

	        # Parse tables and text, keeping the parsers' structured results.
	        table_parser = XMLTableParser(soup)
	        text_parser = XMLTextParser(soup)

	        parsed_xml.tables = table_parser.parse()
	        logger.debug(f"Parsed table content for {filepath}")

	        parsed_xml.text = text_parser.parse()
	        logger.debug(f"Parsed text content for {filepath}")

	        # Collect abbreviations from both the tables and the text.
	        abbreviations = []

	        table_abbreviations = table_parser.get_abbreviations()
	        if table_abbreviations:
	            logger.debug(f"Got {len(table_abbreviations)} abbreviations from tables")
	            abbreviations.extend(table_abbreviations)

	        text_abbreviations = text_parser.get_abbreviations()
	        if text_abbreviations:
	            logger.debug(f"Got {len(text_abbreviations)} abbreviations from text")
	            abbreviations.extend(text_abbreviations)

	        # Store the abbreviations on the document and apply them to its content;
	        # nothing is assigned or applied when none were found.
	        if abbreviations:
	            logger.info(f"Total abbreviations extracted: {len(abbreviations)}")
	            parsed_xml.abbreviations = abbreviations

	            parsed_xml.apply_document_abbreviations()
	            logger.debug("Applied abbreviations to document content")

	        return parsed_xml

	    @classmethod
	    def _get_recursive_files(
	        cls,
	        path_to_dir: os.PathLike,
	        ignore_files_contains: list[str] | None = None,
	    ) -> list[os.PathLike]:
	        """
	        Collect all XML files under a directory at any nesting depth.

	        Args:
	            path_to_dir: Directory to search.
	            ignore_files_contains: Substrings; files whose name contains any
	                of them are skipped. ``None`` (the default) skips nothing.

	        Returns:
	            Paths of the matching files, each prefixed with ``path_to_dir``.
	            The order is whatever ``Path.glob`` yields.
	        """
	        root = Path(path_to_dir)
	        ignored = list(ignore_files_contains or ())
	        return [
	            path
	            # '**' makes the search recursive and also covers the root itself.
	            for path in root.glob('**/*.xml')
	            # Directories named like '*.xml' cannot be opened as files.
	            if path.is_file()
	            and not any(fragment in path.name for fragment in ignored)
	        ]