# Hierarchy parser for Russian normative (GOST-style) documents:
# builds a flat key -> paragraph-text mapping that encodes document structure.
from typing import List, TypeAlias

import re
import shutil
import os

Hierarchy: TypeAlias = dict[str, str]
class HierarchyParser:
    """Builds a flat ``key -> text`` hierarchy for Russian normative documents.

    Keys encode the position of every paragraph (numbered sections,
    appendices, tables, list markers) so downstream consumers can
    reconstruct the document structure from the key alone.
    """

    def __init__(self):
        # Parser state flags; reset per document by __init_parameters().
        self.appendix = False
        self.content_flag = False
        self.preface = False
        # A line like "Приложение А" starts an appendix.
        self.exclude_pattern = r'^Приложение [А-Я]'
        # Numbered-heading regexes for "1. ", "1.1. ", ... up to 7 levels deep
        # (index in this list == heading depth code returned by _get_pattern).
        self.patterns = [r'^' + r'\d+\.' * depth + r'\d+\.?\s' for depth in range(7)]
        self.russian_alphabet = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя"
        self.english_alphabet = "ABCDEFGHIJKabcdefghijk"
        self.table_name = ''
def __init_parameters(self): | |
"""Сбросить найденные параграфы""" | |
self._hierarchy = {} | |
self.duplicate_marker_list = [] | |
self.duplicate_marker_count, self.count, self.count_appendix = 0, 1, 1 | |
self.appendix = False | |
self.letter_ = None | |
self.content_flag = False | |
self.preface = False | |
self.table_name = '' | |
    def _processing_without_paragraph(self, text: str, unique_doc_number):
        """Attach a paragraph that ends with ':' but has no leading numbering.

        The text is appended to the entry the last hierarchy key points at;
        the new key gets a 'PartLevel<n>' suffix.

        Args:
            text: Paragraph text.
            unique_doc_number: Document identifier used as the key prefix.
        """
        last_key = list(self._hierarchy.keys())[-1]
        split_key = last_key.split('_')
        if len(split_key) != 2:
            last_lvl = split_key[1]
            # NOTE(review): spaces are stripped here, so the rebuilt key may not
            # match the stored one if the original segment contained spaces — confirm.
            pattern_text_last = split_key[2].replace(' ', '')
            key = f"{unique_doc_number}_{last_lvl}_{pattern_text_last}"
            if len(split_key) == 3:
                self._hierarchy[f'{key}_PartLevel{self.count}'] = f"{self._hierarchy[key]} {text}"
            else:
                ch = split_key[3]
                # NOTE(review): 'PartLeveL' (capital final L) differs from the
                # 'PartLevel' spelling above — looks intentional (it matches
                # _processing_special_markers), confirm downstream consumers.
                self._hierarchy[f'{key}_{ch}_PartLeveL{self.count}'] = f"{self._hierarchy[key]} {text}"
        else:
            # Only the document-level key exists yet: store under a fresh key.
            self._hierarchy[f'{unique_doc_number}_{text.replace("_", " ")}--DoublePoint0'] = text
def _processing_special_markers(self, text, pattern_text, unique_doc_number): | |
""" | |
Обрабатывает маркеры 'а)', 'б)' или '-' в строке. | |
Args: | |
text: Текст | |
pattern_text: Паттерн для парсинга и кодирования | |
""" | |
last_key = list(self._hierarchy.keys())[-1] | |
split_key = last_key.split('_') | |
if len(split_key) != 2: | |
last_lvl = split_key[1] | |
pattern_text_last = split_key[2] | |
key = f'{unique_doc_number}_{last_lvl}_{pattern_text_last}' | |
if pattern_text is not None: | |
pattern_text = pattern_text.replace(')', '') | |
self._hierarchy[key + f'_PartLevel{pattern_text[0]}'] = f'{self._hierarchy[key]} {text}' | |
else: | |
if len(split_key) == 3: | |
self._hierarchy[key + f'_PartLevel{self.count}'] = f'{self._hierarchy[key]} {text}' | |
else: | |
ch = split_key[3] | |
if ch.replace('PartLevel', '')[-1] in self.russian_alphabet or ch.replace('PartLevel', '')[-1] in self.english_alphabet: | |
if len(split_key) == 4: | |
self.count = 1 | |
self._hierarchy[key + f'_{ch}_PartLeveL{self.count}'] = f'{self._hierarchy[key]} {text}' | |
elif re.search(r"<\d>", last_key): | |
self._hierarchy[key + f'_{ch}_PartLeveL{self.count}'] = f'{self._hierarchy[f"{key}_{ch}"]} {text}' | |
elif 'Table' in ch: | |
self._hierarchy[f'{key}_PartLevel{self.count}'] = f'{self._hierarchy[key]} {text}' | |
else: | |
try: | |
if int(ch.replace('PartLevel', '')) + 1 != self.count: | |
self._hierarchy[f'{key}_PartLevel{int(ch[-1]) + 1}'] = f'{self._hierarchy[key]} {text}' | |
else: | |
self._hierarchy[f'{key}_PartLevel{self.count}'] = f'{self._hierarchy[key]} {text}' | |
except ValueError: | |
self._hierarchy[f'{key}_PartLevel{self.count}'] = f'{self._hierarchy[key]} {text}' | |
else: | |
split_key = last_key.split('^') | |
self._hierarchy[f'{split_key[0]}^PartLevel{self.count}^UniqueNumber{self.unique_count}'] = f"{self._hierarchy[f'{split_key[0]}']} {text}" | |
def _processing_appendix(self, text, level, pattern_text, unique_doc_number): | |
""" | |
Обрабатывает маркеры 'Приложение' и все, что находится внутри приложения | |
Args: | |
text: Текст | |
level: Уровень параграфа | |
pattern_text: Паттерн для парсинга и кодирования | |
""" | |
if pattern_text is not None: | |
pattern_text = pattern_text.replace(')', '') | |
if level == 13: | |
self.letter_ = pattern_text[-1] | |
self.count = 1 | |
self.count_appendix = 1 | |
if level == 13 or level is None and pattern_text is None: | |
if level is None: | |
last_key = list(self._hierarchy.keys())[-1] | |
split_key = last_key.split('_') | |
if len(split_key) == 4: | |
self._hierarchy[f'{last_key}_PartLevel{self.count_appendix}'] = text | |
self.count_appendix += 1 | |
elif len(split_key) == 5: | |
self._hierarchy[f'{"_".join(split_key[:-1])}_PartLevel{self.count_appendix}'] = text | |
self.count_appendix += 1 | |
else: | |
self._hierarchy[f'{unique_doc_number}_Приложение{self.letter_}{self.count}'] = text | |
self.count_appendix = 1 | |
else: | |
self._hierarchy[f'{unique_doc_number}_Приложение{self.letter_}{self.count}'] = text | |
self.count_appendix = 1 | |
self.count += 1 | |
elif level is not None and pattern_text is not None and level != 11: | |
key = f'{unique_doc_number}_Приложение{self.letter_}{self.count}_Level{level}_PatternText{pattern_text}' | |
self._hierarchy[key] = text | |
self.count += 1 | |
self.count_appendix = 1 | |
elif level == 10: | |
last_key = list(self._hierarchy.keys())[-1] | |
split_key = last_key.split('_') | |
key = f'{last_key}_PartLevel{self.count_appendix}' | |
if len(split_key) != 2: | |
self._hierarchy[key] = f"{self._hierarchy[last_key]} {text}" | |
else: | |
self._hierarchy[key] = text | |
self.count_appendix += 1 | |
elif level == 11: | |
last_key = list(self._hierarchy.keys())[-1] | |
split_key = last_key.split('_') | |
appendix = split_key[1] | |
if len(split_key) == 2: | |
key = f'{last_key}_PartLevel{self.count_appendix}' | |
self._hierarchy[key] = f"{self._hierarchy[last_key]} {text}" | |
elif len(split_key) != 3: | |
last_lvl = split_key[2] | |
pattern_text_last = split_key[3] | |
key = f'{unique_doc_number}_{appendix}_{last_lvl}_{pattern_text_last}' | |
self._hierarchy[f'{key}_PartLevel{self.count_appendix}'] = f"{self._hierarchy[key]} {text}" | |
else: | |
key = f'{unique_doc_number}_{appendix}' | |
try: | |
self._hierarchy[f'{key}_PartLevel{self.count_appendix}'] = f"{self._hierarchy[f'{key}_PartLevel1']} {text}" | |
except KeyError: | |
print('asdfasdf') | |
self._hierarchy[f'{key}_PartLevel{self.count_appendix}'] = text | |
self.count_appendix += 1 | |
def _get_pattern(self, text): | |
""" | |
Метод находит паттерны в документе для дальнейшей обработки в соответствии с паттерном | |
Args: | |
text: Текст. | |
Returns: | |
Код паттерна или None, Паттерн или None | |
Notes: | |
0-7 это разделы 1, 1.1., 1.1.1. и т.д | |
10 это паттерн для поиска ':' в конце предложения | |
11 это паттерн для поиска а), б) или строк начинающихся с '-', '' | |
12 это паттерн для поиска наименования таблиц 'Таблица 1', 'Таблица 2' в начале строки | |
13 это паттерн для поиска 'Приложение А', 'Приложение Б' в начале строки | |
14 это паттерн для поиска <1>, <2> в начале строки | |
""" | |
for i, pattern in enumerate(self.patterns): | |
pattern = re.match(pattern, text) | |
if pattern: | |
self.preface = False | |
return i, pattern.group(0).replace(' ', '').replace('\xa0', '') | |
if re.match(r'^- ', text) or re.match(r'^– ', text): | |
return 11, None | |
if re.match(r'^ ', text): | |
return 11, None | |
if re.match(r'\d\)', text): | |
return 11, None | |
if re.match(r'\w\)', text): | |
pattern = re.match(r'\w\)', text) | |
return 11, pattern.group(0).replace('\xa0', '') | |
if re.match(r"<\d>", text): | |
pattern = re.match(r"<\d>", text) | |
return 14, pattern.group(0).replace('\xa0', '') | |
if re.search(r':$', text) or re.search(r':$', text): | |
return 10, None | |
if re.findall(r'^Таблица \d+\.?', text): | |
pattern = re.match(r'^Таблица \d+\.?', text) | |
return 12, pattern.group(0).replace('\xa0', '') | |
if re.match(self.exclude_pattern, text): | |
pattern = re.match(self.exclude_pattern, text) | |
# if pattern.regs[0][1] + 2 < len(text): | |
# return None, None | |
self.appendix = True | |
return 13, pattern.group(0).replace('\xa0', '') | |
if re.match(r'^Содержание', text): | |
self.content_flag = True | |
return 15, None | |
if re.match(r'^Предисловие', text): | |
self.preface = True | |
return 16, None | |
return None, None | |
    def _find_duplicate_marker(self, level, pattern_text):
        """Detect repeated paragraph markers and suffix duplicates with an ordinal.

        Args:
            level: Paragraph pattern code (0 = top-level numbered heading).
            pattern_text: Matched marker text, or None.

        Returns:
            The (possibly renamed) marker, or None if pattern_text was None.
        """
        if pattern_text is not None:
            pattern_text = pattern_text.replace(' ', '')
            # Duplicate only when seen before and the marker does not start with
            # a Cyrillic letter (e.g. 'Приложение'/'Таблица' are exempt).
            if pattern_text in self.duplicate_marker_list and pattern_text[0] not in self.russian_alphabet:
                if level == 0:
                    # NOTE(review): the counter advances only for level-0 headings,
                    # so deeper duplicates reuse the current count — confirm intended.
                    self.duplicate_marker_count += 1
                pattern_text = f'{pattern_text}:Duplicate{self.duplicate_marker_count}'
            self.duplicate_marker_list.append(pattern_text)
        return pattern_text
def __find_last_paragraph(self, section): | |
for paragraph_ind in range(section.Paragraphs.Count): | |
paragraph = section.Paragraphs[paragraph_ind] | |
if paragraph.Text == '': | |
continue | |
if paragraph.ListText != '': | |
text = f'{paragraph.ListText} {paragraph.Text}' | |
else: | |
text = paragraph.Text | |
level_paragraph, pattern_text = self._get_pattern(text) | |
if pattern_text and 350 < paragraph_ind < 490 and level_paragraph < 2: | |
pattern_text_zxc = paragraph.Text | |
try: | |
return pattern_text_zxc | |
except: | |
return None | |
    def parse_table(self, doc: List, unique_doc_number):
        """Parse table rows: 'Т<n>'/'T<n>' lines are table markers, the first
        row after a marker becomes the entry key, later rows are appended.

        Args:
            doc: Iterable of raw table row strings.
            unique_doc_number: Document identifier used as the key prefix.
        """
        self.__init_parameters()
        flag = True  # True -> the next row starts a new table entry
        for text in doc:
            text = text.strip()  # drop leading/trailing whitespace
            # Strip extraction artifacts (separators, control characters).
            text = text.replace('------', '').replace('--', '').replace('\u000b', ' ').replace('\t', ' ')
            # NOTE(review): replace('', '-') inserts '-' between every character —
            # the first argument looks like a bullet char lost in transcoding; verify.
            text = text.replace('_', ' ').replace('\u0007', ' ').replace(' ', ' ').replace('', '-')
            text = text.replace(' ', ' ')
            if not text:
                continue
            # Table marker: Cyrillic 'Т' or Latin 'T' followed by 1-2 digits.
            if re.match(r'^Т\d?\d$', text) or re.match(r'^T\d?\d$', text):
                try:
                    # Drop the previous entry when it is empty or a bare stub.
                    last_key = list(self._hierarchy.keys())[-1]
                    last_text = self._hierarchy[last_key]
                    if re.search(r': \d?\d?: $', last_text) or re.search(r': \d?\d?:$', last_text) or last_text == '':
                        self._hierarchy.pop(last_key, None)
                except IndexError:
                    pass  # no entries yet
                self.table_name = text
                flag = True
            elif flag:
                # First row after a marker names the table entry.
                self._hierarchy[f'{unique_doc_number}_Table{self.table_name}_String{text}'] = ''
                flag = False
            else:
                # Subsequent rows: append unless the text is already contained.
                last_key = list(self._hierarchy.keys())[-1]
                if text in self._hierarchy[last_key]:
                    continue
                self._hierarchy[last_key] = f'{self._hierarchy[last_key]} {text}'
        # Final cleanup: drop a trailing stub entry, if any.
        try:
            last_key = list(self._hierarchy.keys())[-1]
            last_text = self._hierarchy[last_key]
            if re.search(r': \d?\d?: $', last_text) or re.search(r': \d?\d?:$', last_text):
                self._hierarchy.pop(last_key, None)
        except IndexError:
            pass
    def parse(self, doc: List, unique_doc_number, stop_appendix_list):
        """Parse document paragraphs into the flat key -> text hierarchy.

        Args:
            doc: Iterable of raw paragraph strings.
            unique_doc_number: Document identifier used as the key prefix.
            stop_appendix_list: Appendix letters/markers at which appendix
                processing must stop.
        """
        self.__init_parameters()
        name_paragraph = None  # last numbered heading seen
        flag = True  # True until the first non-empty paragraph (the title)
        flag_appendix_stop = False  # set once a stop-listed appendix is reached
        self.unique_count = 0
        for text in doc:
            text = text.strip()  # drop leading/trailing whitespace
            # Strip extraction artifacts (separators, control characters, BOM).
            text = text.replace('------', '').replace('--', '').replace('\u000b', ' ').replace('\t', ' ').replace('\ufeff', '')
            # NOTE(review): replace('', '-') inserts '-' between every character —
            # the first argument looks like a bullet char lost in transcoding; verify.
            text = text.replace('_', ' ').replace('\u0007', ' ').replace(' ', ' ').replace('', '-').replace('', '')
            if not text:
                continue
            if flag:
                # The very first non-empty paragraph is the document title.
                self._hierarchy[f'{unique_doc_number}_{text.replace("_", " ")}'] = f'{text}'
                flag = False
                continue
            level_paragraph, pattern_text = self._get_pattern(text)  # classify the paragraph
            if self.preface and not self.content_flag:
                # Everything inside the preface is accumulated under one key.
                if level_paragraph == 16:
                    self._hierarchy[f'{unique_doc_number}_Предисловие'] = f'{text}'
                    continue
                else:
                    self._hierarchy[f'{unique_doc_number}_Предисловие:^PatternText{pattern_text}'] = f'{self._hierarchy[f"{unique_doc_number}_Предисловие"]} {text}'
                    continue
            if self.content_flag:
                # Inside the table of contents.
                self.preface = False
                if level_paragraph == 15:
                    self._hierarchy[f'{unique_doc_number}_Содержание'] = f'{text}'
                    continue
                elif level_paragraph is not None:
                    # A real heading (not a TOC line ending in a page number)
                    # terminates the table of contents and falls through below.
                    if level_paragraph <= 9 and '1' in pattern_text and not re.findall(r'\d+$', text):
                        self.content_flag = False
                    else:
                        self._hierarchy[
                            f'{unique_doc_number}_Содержание'] = f'{self._hierarchy[f"{unique_doc_number}_Содержание"]} {text}'
                        if level_paragraph == 13:
                            self.appendix = False
                        continue
                elif text not in self._hierarchy[f'{unique_doc_number}_Содержание']:
                    self._hierarchy[f'{unique_doc_number}_Содержание'] = f'{self._hierarchy[f"{unique_doc_number}_Содержание"]} {text}'
                    continue
                else:
                    self.content_flag = False
            if self.appendix and not self.content_flag:
                if level_paragraph == 13:
                    # Stop appendix processing at stop-listed appendices.
                    if pattern_text in stop_appendix_list or pattern_text[-1] in stop_appendix_list:
                        flag_appendix_stop = True
                        continue
                    else:
                        flag_appendix_stop = False
                if not flag_appendix_stop:
                    self._processing_appendix(text, level_paragraph, pattern_text, unique_doc_number)
                continue
            # TODO Verify this works and, if needed, move it back before the appendix condition.
            pattern_text = self._find_duplicate_marker(level_paragraph, pattern_text)
            if level_paragraph is not None:
                if level_paragraph <= 9:  # numbered headings 1., 1.1., ... up to 1.1.1.1.1.1.1
                    name_paragraph = text
                    key = f'{unique_doc_number}_Level{level_paragraph}_PatternText{pattern_text}'
                    self._hierarchy[key] = text
                    self.count = 1
                elif level_paragraph == 10:
                    # Paragraph ending with ':' and no numbering.
                    self._processing_without_paragraph(text, unique_doc_number)
                    self.unique_count += 1
                    self.count = 1
                elif level_paragraph == 11:
                    # List markers 'а)', 'б)' or '-'.
                    self._processing_special_markers(text, pattern_text, unique_doc_number)
                    self.count += 1
                elif level_paragraph == 14:
                    # Footnote-style '<n>' paragraphs.
                    if name_paragraph is not None:
                        level_, pattern_text_ = self._get_pattern(name_paragraph)
                        self._hierarchy[f'{unique_doc_number}_Level{level_}_PatternText{pattern_text_}_PartLevel{pattern_text}'] = f'{text}'
                    else:
                        new_text = text.replace(f'{pattern_text}', '')
                        self._hierarchy[f'{unique_doc_number}_{new_text}'] = f'{text}'
                    self.count = 1
                elif level_paragraph == 12:  # table captions 'Таблица N'
                    last_key = list(self._hierarchy.keys())[-self.count]
                    split_key = last_key.split('_')
                    last_lvl = split_key[1]
                    numer_table = pattern_text.replace('Таблица', '').replace('.', '')
                    if len(split_key) != 2:
                        pattern_text_last = last_key.split('_')[2]
                        key = f'{unique_doc_number}_{last_lvl}_{pattern_text_last}_Table{numer_table}'
                    else:
                        pattern_text_last = last_key.split('_')[1]
                        key = f'{unique_doc_number}_{pattern_text_last}_Table{numer_table}'
                    self._hierarchy[key] = text
                    self.count = 1
            else:
                # Plain continuation text: append to the previous entry.
                last_key = list(self._hierarchy.keys())[-1]
                if name_paragraph is not None:
                    split_key = last_key.split('_')
                    if 'Table' in split_key[-1]:
                        # Text right after a table caption re-attaches to the
                        # heading the table belongs to.
                        level_paragraph, pattern_text = self._get_pattern(name_paragraph)
                        self._hierarchy[f'{unique_doc_number}_Level{level_paragraph}_PatternText{pattern_text}_PartLevel{self.count}'] = f'{text}'
                        self.count += 1
                    else:
                        self._hierarchy[f'{last_key}'] = f'{self._hierarchy[f"{last_key}"]} {text}'
                else:
                    self._hierarchy[f'{last_key}'] = f'{self._hierarchy[last_key]} {text}'
    def hierarchy(self) -> Hierarchy:
        """Return the parsed document hierarchy (flat key -> paragraph text)."""
        return self._hierarchy
def clear_tmp(root_path):
    """Delete everything inside *root_path*, keeping the directory itself.

    Args:
        root_path: Path of the directory to empty.

    Raises:
        FileNotFoundError: if *root_path* is not an existing directory.
    """
    if not os.path.isdir(root_path):
        raise FileNotFoundError(f"Папка {root_path} не найдена.")
    for entry_name in os.listdir(root_path):
        full_path = os.path.join(root_path, entry_name)
        # Directories are removed recursively, plain files individually.
        remove = shutil.rmtree if os.path.isdir(full_path) else os.remove
        remove(full_path)