Spaces:

muryshev
/

generic-chatbot-backend

Runtime error

App Files Files Community

generic-chatbot-backend / components /services /files.py

muryshev

init

57cf043 8 months ago

raw

history blame

7.22 kB

	import logging
	import os
	import re
	import subprocess
	from pathlib import Path

	from docx import Document
	from docx.oxml import parse_xml
	from docx.oxml.ns import qn
	from fastapi import HTTPException

	# Literal WordprocessingML fragments that delimit the service block cut out by
	# FileService._prettify_xml. Both are passed through re.escape so they can be
	# handed to re.search as plain-text patterns.
	START_SPECIALS = re.escape(
	    '<w:t>$$$$$SVB396</w:t></w:r></w:p>'
	    '<w:p><w:pPr><w:ind w:first-line="0"/><w:jc w:val="left"/></w:pPr><w:r>'
	)
	END_SPECIALS = re.escape(
	    '<w:t>$$18 Текст</w:t></w:r></w:p>'
	    '<w:p><w:pPr><w:ind w:first-line="0"/></w:pPr>'
	)


	class FileService:
	    """Prepare source XML documents and render them to PDF with LibreOffice.

	    Processed XML files and the PDFs derived from them are stored under
	    ``documents_path`` and reused when they already exist; unprocessed
	    originals are read from ``source_path``.
	    """

	    def __init__(self):
	        """Read the working directories from the environment.

	        ``DOCUMENTS_PATH`` defaults to ``/app/data/xmls_processed`` and
	        ``SOURCE_PATH`` defaults to ``/app/data/SEND``.
	        """
	        self.documents_path = Path(os.environ.get('DOCUMENTS_PATH', '/app/data/xmls_processed'))
	        self.source_path = Path(os.environ.get('SOURCE_PATH', '/app/data/SEND'))

	    def prepare_file(self, filename: str) -> Path:
	        """Return the path of the processed XML file, creating it on first use.

	        When the file is not yet present in ``documents_path`` it is read from
	        ``source_path``, passed through ``_prettify_xml`` and written to
	        ``documents_path``.

	        Args:
	            filename (str): File name, relative to the data directories.

	        Returns:
	            Path: Path of the processed file inside ``documents_path``.

	        Raises:
	            HTTPException: 404 when ``filename`` points outside of the data
	                directories or the source file does not exist.
	        """
	        # The name is supplied by the caller; never let "../" segments or an
	        # absolute path escape the configured directories.
	        if not self._is_inside(self.documents_path, filename):
	            logging.error(f"Rejected file name outside of {self.documents_path}: {filename!r}")
	            raise HTTPException(status_code=404, detail="File not found")

	        file_path = self.documents_path / filename
	        if file_path.exists():
	            return file_path

	        if not self._is_inside(self.source_path, filename):
	            logging.error(f"Rejected file name outside of {self.source_path}: {filename!r}")
	            raise HTTPException(status_code=404, detail="File not found")

	        source_file_path = self.source_path / filename
	        logging.info(f"Process file: {source_file_path}")

	        # is_file() is already False for a missing path.
	        if not source_file_path.is_file():
	            logging.error(f"File not found: {source_file_path}")
	            logging.error(f"Directory: {self.source_path} exists: {self.source_path.exists()}")
	            raise HTTPException(status_code=404, detail="File not found")

	        with open(source_file_path, "r", encoding="utf-8") as source_file:
	            file_content = self._prettify_xml(source_file.read())

	        file_path.parent.mkdir(parents=True, exist_ok=True)
	        with open(file_path, "w", encoding="utf-8") as file:
	            file.write(file_content)

	        logging.info(f"File saved: {file_path}")
	        return file_path

	    def prepare_pdf(self, filename: str) -> Path:
	        """Return the path of the PDF rendering of ``filename``, building it if needed.

	        The processed XML is converted to docx, the docx is cleaned up by
	        ``_fix_style_table_docx`` and then converted to PDF. The intermediate
	        docx is removed afterwards, whether or not the conversion succeeded.

	        Args:
	            filename (str): File name, relative to the data directories.

	        Returns:
	            Path: Path of the PDF file next to the processed XML file.

	        Raises:
	            HTTPException: 404 from ``prepare_file``; 400 when one of the
	                conversion steps fails or produces no output file.
	        """
	        prepared_file = self.prepare_file(filename)
	        docx_path = prepared_file.with_suffix('.docx')
	        pdf_path = prepared_file.with_suffix('.pdf')
	        if pdf_path.exists():
	            return pdf_path

	        try:
	            # A zero exit code alone does not prove that the output file was
	            # written, so the expected file is checked as well.
	            if self._convert_to_docx(prepared_file) != 0 or not docx_path.exists():
	                raise HTTPException(status_code=400, detail="Failed to convert xml to docx")

	            self._fix_style_table_docx(docx_path)

	            if self._convert_to_pdf(docx_path) != 0 or not pdf_path.exists():
	                raise HTTPException(status_code=400, detail="Failed to convert docx to pdf")
	        finally:
	            # Do not leave the intermediate docx behind when a step fails.
	            if docx_path.exists():
	                docx_path.unlink()

	        return pdf_path

	    @staticmethod
	    def _is_inside(base: Path, filename: str) -> bool:
	        """Tell whether ``base / filename`` stays inside the ``base`` directory.

	        Args:
	            base (Path): Directory the file must live in.
	            filename (str): Caller-supplied relative file name.

	        Returns:
	            bool: False for names that resolve outside of ``base`` (``..``
	            segments, absolute paths) or that cannot be resolved at all.
	        """
	        try:
	            root = base.resolve()
	            (root / filename).resolve().relative_to(root)
	        except ValueError:
	            return False
	        return True

	    @staticmethod
	    def _prettify_xml(file_content: str) -> str:
	        """Cut the leading service fragment out of the XML text.

	        Everything from the first ``START_SPECIALS`` match up to the first
	        ``END_SPECIALS`` match is removed so that the rendered document looks
	        clean. The text is returned unchanged when either marker is missing.

	        Args:
	            file_content (str): Content of the XML file.

	        Returns:
	            str: Content of the XML file without the service fragment.
	        """
	        start = re.search(START_SPECIALS, file_content)
	        end = re.search(END_SPECIALS, file_content)
	        if not (start and end):
	            return file_content

	        # NOTE(review): the "+ 1" also drops the single character that follows
	        # the end marker; presumably this is where the "w:r>" residue stripped
	        # in _fix_style_table_docx comes from - confirm before changing it.
	        return file_content[:start.start()] + file_content[end.end() + 1:]

	    def _fix_style_table_docx(self, file_path: Path) -> None:
	        """Rewrite the docx in place with plain paragraphs and grid tables.

	        Paragraphs are copied as text with ``{...}`` groups and leftover
	        markers removed. Tables are re-created with the ``TableGrid`` style;
	        text fragments that contain ``{`` are skipped. Other body elements are
	        not copied.

	        Args:
	            file_path (Path): Path of the docx file; it is overwritten.
	        """
	        source_doc = Document(str(file_path))
	        output_doc = Document()

	        for block in source_doc.element.body:
	            if block.tag.endswith('p'):
	                clear_text = self._remove_curly_braces_content(block.text).replace('w:r>', '')
	                clear_text = clear_text.replace('См. документ в MS-Word', '')
	                output_doc.add_paragraph(clear_text)

	            elif block.tag.endswith('tbl'):
	                old_rows = parse_xml(block.xml).findall(qn('w:tr'))
	                if not old_rows:
	                    # max() below would fail on a table without rows.
	                    continue

	                column_count = max(len(row.findall(qn('w:tc'))) for row in old_rows)
	                table = output_doc.add_table(rows=len(old_rows), cols=column_count)
	                table.style = 'TableGrid'
	                table.autofit = False

	                for row_index, old_row in enumerate(old_rows):
	                    try:
	                        # Resolve the target row once instead of once per fragment.
	                        new_cells = table.rows[row_index].cells
	                    except IndexError:
	                        logging.warning('Ошибка в индексе, таблица не правильной формы')
	                        continue

	                    for cell_index, old_cell in enumerate(old_row.findall(qn('w:tc'))):
	                        # An empty <w:t/> has text None; treat it as ''.
	                        fragments = (node.text or '' for node in old_cell.findall(f".//{qn('w:t')}"))
	                        cell_text = ''.join(part for part in fragments if '{' not in part)
	                        if not cell_text:
	                            continue
	                        try:
	                            new_cells[cell_index].text = cell_text
	                        except IndexError:
	                            logging.warning('Ошибка в индексе, таблица не правильной формы')

	        output_doc.save(str(file_path))

	    @staticmethod
	    def _remove_curly_braces_content(text: str) -> str:
	        """Remove every innermost ``{...}`` group, braces included."""
	        return re.sub(r'\{[^{}]*\}', '', text)

	    def _convert_to_pdf(
	        self,
	        file_path: Path,
	    ) -> int:
	        """Convert a docx file to PDF next to the source file.

	        Returns:
	            int: Exit code of the converter; 0 means success.
	        """
	        return self._run_libreoffice_convert(file_path, 'pdf')

	    def _convert_to_docx(
	        self,
	        file_path: Path,
	    ) -> int:
	        """Convert an XML file to docx next to the source file.

	        Returns:
	            int: Exit code of the converter; 0 means success.
	        """
	        return self._run_libreoffice_convert(file_path, 'docx')

	    @staticmethod
	    def _run_libreoffice_convert(file_path: Path, target_format: str) -> int:
	        """Run headless LibreOffice to convert ``file_path`` to ``target_format``.

	        The output is written to the directory of ``file_path``.

	        Returns:
	            int: Exit code of the ``libreoffice`` process.
	        """
	        command = [
	            'libreoffice', '--headless',
	            '--convert-to', target_format,
	            '--outdir', str(file_path.parent),
	            str(file_path),
	        ]
	        return subprocess.run(command).returncode