# Spaces: Sleeping
# (page-header residue left over from the web export; not part of the module)
from typing import List, Optional, Tuple

import pandas as pd
class MetadataManager:
    """Expand rows of a document-fragment table into their surrounding paragraph.

    The manager wraps a DataFrame in which every row is one text fragment and
    the remaining columns describe where that fragment sits in its document.
    Columns read by this class: ``Index``, ``DocNumber``, ``DocName``,
    ``Title``, ``Text``, ``Table``, ``PartLevel1``, ``Appendix``,
    ``LevelParagraphAppendix``, ``PargaraphAppendix``, ``Pargaraph``,
    ``LevelParagraph``, ``Duplicate`` and ``DuplicateAppendix`` (the
    ``Pargaraph`` spelling is the actual column name).

    Missing values are replaced with the string ``'unknown'`` at construction
    time, so the rest of the class compares against that sentinel.

    NOTE(review): positional (``iloc``) and label (``loc``) lookups are used
    interchangeably with the same ``ind``; this presumably relies on the frame
    having a default ``RangeIndex`` - confirm against the loader.
    """

    def __init__(self, df: pd.DataFrame, logger):
        """Store the logger and a cleaned copy of ``df``.

        Args:
            df: Fragment table. An ``Embedding`` column, if present, is left
                out of the stored copy; the caller's frame is not modified.
            logger: Object exposing ``info(message)``.
        """
        self.logger = logger
        # Non-mutating drop: the caller keeps its embeddings, and a frame that
        # has no 'Embedding' column is accepted as well.
        self.df = df.drop(columns='Embedding', errors='ignore')
        self.df = self.df.where(pd.notna(self.df), 'unknown')

    @staticmethod
    def __search_sub_level(df: pd.DataFrame, header_text: Optional[str] = None) -> List[str]:
        """Return the ``Text`` of every row, de-duplicating a shared header.

        The first row keeps the header and gets a newline inserted right after
        it; in every following row the header is removed and a newline is
        appended.

        Args:
            df: Rows to render, in output order.
            header_text: Header to look for. Defaults to the ``Text`` of the
                first row of ``df``.

        Returns:
            One string per row of ``df`` (an empty list for an empty frame).
        """
        if df.empty:
            return []
        if header_text is None:
            header_text = df.iloc[0]['Text']
        header = str(header_text)

        paragraphs = []
        for position, text in enumerate(df['Text']):
            if position == 0:
                # str.replace('', x) would insert x between all characters,
                # so an empty header is simply skipped.
                if header:
                    text = text.replace(header, f'{header}\n')
            else:
                if header:
                    text = text.replace(header, '')
                text = text + '\n'
            paragraphs.append(text)
        return paragraphs

    @staticmethod
    def __rows_sharing_value(df: pd.DataFrame, ind: int, column: str) -> pd.DataFrame:
        """Return the rows whose ``column`` equals the value in row ``ind``.

        A missing value (``None``/NaN) in row ``ind`` selects the rows where
        the column is missing too, because ``==`` never matches NaN.
        """
        value = df.loc[ind, column]
        if pd.isna(value):
            return df[df[column].isna()]
        return df[df[column] == value]

    @staticmethod
    def __check_duplicates(df: pd.DataFrame, ind: int) -> pd.DataFrame:
        """Keep only the rows with the same ``Duplicate`` value as row ``ind``."""
        return MetadataManager.__rows_sharing_value(df, ind, 'Duplicate')

    @staticmethod
    def __check_appendix_duplicates(df: pd.DataFrame, ind: int) -> pd.DataFrame:
        """Keep only the rows with the same ``DuplicateAppendix`` value as row ``ind``."""
        return MetadataManager.__rows_sharing_value(df, ind, 'DuplicateAppendix')

    def _collect_paragraphs(self, df: pd.DataFrame, shape: int) -> Tuple[List[str], Optional[int]]:
        """Render ``df`` unless it has ``shape`` rows or fewer.

        Returns:
            ``(paragraphs, first_index_label)``, or ``([], None)`` when the
            frame is too small.
        """
        if df.shape[0] <= shape:
            return [], None
        return self.__search_sub_level(df), df.index[0]

    def _paragraph_appendix_content(self, df, pattern: str, ind: int,
                                    shape: int) -> Tuple[List[str], Optional[int]]:
        """Return the content of an appendix paragraph together with its sub-items.

        Args:
            df: DataFrame to search in.
            pattern: Regular expression matched against ``PargaraphAppendix``.
            ind: Index label of the row being expanded; it is always kept.
            shape: Row count at or below which an empty result is returned.

        Returns:
            ``(paragraphs, start_index)``; ``([], None)`` when too few rows
            were selected.

        Examples:
            3.1. Paragraph:
                1) - Content 1;
                2) - Content 2;
                3) - Content 3;
        """
        selected = df[df['PargaraphAppendix'].str.match(pattern, na=False) | (df.index == ind)]
        selected = self.__check_appendix_duplicates(selected, ind)
        return self._collect_paragraphs(selected, shape)

    def _paragraph_content(self, df, pattern: str, ind: int,
                           shape: int) -> Tuple[List[str], Optional[int]]:
        """Return the content of a paragraph together with its sub-items.

        A row is selected when its ``Pargaraph`` matches ``pattern`` and it has
        the same ``Duplicate`` value as row ``ind``; row ``ind`` itself is
        always selected.

        Args:
            df: DataFrame to search in.
            pattern: Regular expression matched against ``Pargaraph``.
            ind: Index label of the row being expanded.
            shape: Row count at or below which an empty result is returned.

        Returns:
            ``(paragraphs, start_index)``; ``([], None)`` when too few rows
            were selected.

        Examples:
            3.1. Paragraph:
                1) - Content 1;
                2) - Content 2;
                3) - Content 3;
        """
        matches_pattern = df['Pargaraph'].str.match(pattern, na=False)
        same_duplicate = df['Duplicate'] == df.loc[ind]['Duplicate']
        # Explicit grouping of the original `a & b | c` expression.
        selected = df[(matches_pattern & same_duplicate) | (df.index == ind)]
        return self._collect_paragraphs(selected, shape)

    def _paragraph_content2(self, df, pattern: str, ind: int,
                            shape: int) -> Tuple[List[str], Optional[int]]:
        """Return the rows whose ``Pargaraph`` matches ``pattern``.

        Unlike ``_paragraph_content`` this applies no ``Duplicate`` filter and
        does not force row ``ind`` into the selection.

        Args:
            df: DataFrame to search in.
            pattern: Regular expression matched against ``Pargaraph``.
            ind: Unused; kept so the signature mirrors ``_paragraph_content``.
            shape: Row count at or below which an empty result is returned.

        Returns:
            ``(paragraphs, start_index)``; ``([], None)`` when too few rows
            were selected.
        """
        selected = df[df['Pargaraph'].str.match(pattern, na=False)]
        return self._collect_paragraphs(selected, shape)

    @staticmethod
    def _first_unknown_index(df):
        """Return the ``Text`` of a row whose ``PartLevel1`` is ``'unknown'``.

        NOTE(review): despite the name, the *last* such row is used - confirm
        which one is intended.

        Returns:
            The text, or ``None`` when no row qualifies.
        """
        indexes = list(df[df['PartLevel1'].isin(['unknown'])].index)
        if not indexes:
            return None
        return df.loc[indexes[-1]]['Text']

    def _appendix_context(self, df, row, ind):
        """Build the context for a row that belongs to an appendix.

        Args:
            df: Rows of the current document.
            row: The row at label ``ind``.
            ind: Index of the row being expanded.

        Returns:
            ``(text, start_index)``; ``text`` is empty when nothing matched.
        """
        df = df[df['Appendix'] == row['Appendix']]
        paragraph_number = row['PargaraphAppendix']

        if paragraph_number == 'unknown':
            if row['LevelParagraphAppendix'] == 'unknown':
                # No numbering at all: take the row and the labels up to ind + 7.
                window = df.loc[ind:ind + 7]
                return ' '.join(self.__search_sub_level(window)), window.index[0]
            # A level but no number: take neighbouring rows of the whole table.
            # The start is clamped because a negative iloc start counts from
            # the END of the frame and would produce an empty slice.
            first = max(int(ind) - 10, 0)
            texts = self.df.iloc[first:ind + 10]['Text'].values
            return ' '.join(texts), df.index[0]

        escaped = paragraph_number.replace('.', r'\.')
        pattern = f'^{escaped}?\\d?.?$'
        if df[df['PargaraphAppendix'].str.match(pattern, na=False)].shape[0] == 1:
            # Only the row itself matched, so widen the search to its siblings.
            # Splitting the ESCAPED text on '.' leaves parts that end with a
            # backslash; putting a '.' after such a part restores the r'\.'.
            parts = [elem for elem in escaped.split('.') if elem]
            if len(parts) == 1:
                pattern = f'^{parts[0]}.?\\d?.?$'
            else:
                prefix = '.'.join(parts[:-1])
                pattern = f'^{prefix}.\\d.?$'

        df = df[df['PargaraphAppendix'].str.match(pattern, na=False)]
        if df.empty:
            # Nothing matched the widened pattern; an empty text makes
            # `search` fall back to the fragment itself.
            return '', ind
        return ' '.join(self.__search_sub_level(df)), df.index[0]

    def _paragraph_context(self, df, row, ind, start_index_paragraph):
        """Build the context for a row of the main document body.

        Args:
            df: Rows of the current document.
            row: The row at label ``ind``.
            ind: Index of the row being expanded.
            start_index_paragraph: Start index to report when the row has no
                paragraph number.

        Returns:
            ``(text, start_index)``; ``text`` is empty and ``start_index`` is
            ``None`` when no pattern selected enough rows.
        """
        paragraph_number = row['Pargaraph']

        if paragraph_number == 'unknown':
            # Documents without numbered items: pass a few rows above and below.
            header_text = self._first_unknown_index(df)
            window = df.loc[int(ind - 2):ind + 2]
            paragraph = self.__search_sub_level(window, header_text)
            return ' '.join(paragraph), start_index_paragraph

        escaped = paragraph_number.replace('.', r'\.')
        # Look for sub-items inside the item.
        paragraph, start_index_paragraph = self._paragraph_content(df, fr'^{escaped}?$', ind, 2)
        if len(paragraph) == 0:
            pattern = f'{escaped}\\d?.?\\d?\\d?.?$'
            paragraph, start_index_paragraph = self._paragraph_content2(df, pattern, ind, 0)

        if len(paragraph) == 0:
            if row['LevelParagraph'] != '0':
                # Fall back to the siblings under the parent item. Parts are
                # joined with an escaped dot so that '.' only matches a dot.
                parts = [elem for elem in paragraph_number.split('.') if elem]
                prefix = r'\.'.join(parts[:-1])
                pattern = f'^{prefix}\\.\\d\\d?.?$'
            else:
                pattern = escaped
                if '.' not in pattern:
                    pattern = pattern + r'\.'
                pattern = f'^{pattern}\\d.?\\d?.?$'
            paragraph, start_index_paragraph = self._paragraph_content(df, pattern, ind, 0)

        return ' '.join(paragraph), start_index_paragraph

    def _search_other_info(self, ind, doc_number):
        """Collect the text surrounding row ``ind`` inside document ``doc_number``.

        Args:
            ind: Index of the row being expanded.
            doc_number: ``DocNumber`` value of the document the row belongs to.

        Returns:
            ``(text, start_index)``. ``text`` is empty when no context could be
            built; the caller is expected to fall back to the row's own text.

        Raises:
            KeyError: If ``ind`` is not a label of the selected document rows.
        """
        df = self.df[self.df['DocNumber'] == doc_number]
        row = df.loc[ind]
        start_index_paragraph = row['Index'] - 1

        if row['Table'] != 'unknown':
            return row['Text'], ind
        if row['PartLevel1'] != 'unknown' and 'Table' in str(row['PartLevel1']):
            return '', ind
        if row['Appendix'] != 'unknown':
            return self._appendix_context(df, row, ind)
        return self._paragraph_context(df, row, ind, start_index_paragraph)

    @staticmethod
    def filter_answer(answer):
        """Drop answers that repeat an already seen ``start_index_paragraph``.

        Args:
            answer: Mapping of answers; each value has a
                ``'start_index_paragraph'`` key.

        Returns:
            A new dict with the first answer for every start index, re-keyed
            ``0, 1, 2, ...`` in the original order.
        """
        seen_starts = set()
        new_answer = {}
        for item in answer.values():
            start = item['start_index_paragraph']
            if start in seen_starts:
                continue
            seen_starts.add(start)
            new_answer[len(new_answer)] = item
        return new_answer

    def _clear_doc_name(self, ind):
        """Return the ``DocName`` of row ``ind`` without its prefix and extensions.

        Everything up to the first ``'_'`` is dropped, the remaining ``'_'``
        become spaces, and the substrings ``.txt``, ``.json``, ``.DOCX``,
        ``.DOC`` and ``tables`` are removed.
        """
        name = ' '.join(self.df.iloc[ind]['DocName'].split('_')[1:])
        for fragment in ('.txt', '.json', '.DOCX', '.DOC', 'tables'):
            name = name.replace(fragment, '')
        return name

    def search(self, indexes: List) -> dict:
        """Build the answers for the given row indexes.

        Args:
            indexes: Row positions to expand (per the original comments, the
                rows returned by the vector search).

        Returns:
            Dict keyed ``0..n-1``; each value holds ``id``, ``index_answer``,
            ``doc_name``, ``title``, ``text_answer``, ``other_info`` and
            ``start_index_paragraph``. Answers sharing a start index are
            collapsed to the first one.
        """
        answers = {}
        for i, ind in enumerate(indexes):
            row = self.df.iloc[ind]
            doc_number = row['DocNumber']
            answer = {
                'id': doc_number,
                'index_answer': int(ind),
                'doc_name': self._clear_doc_name(ind),
                'title': row['Title'],
                'text_answer': row['Text'],
            }
            try:
                other_info, start_index_paragraph = self._search_other_info(ind, doc_number)
            except (KeyError, IndexError):
                # Inconsistent index data: fall back to the fragment itself.
                other_info, start_index_paragraph = row['Text'], ind
                self.logger.info('Ошибка в индексе, проверьте БД!')
            if len(other_info) == 0:
                other_info, start_index_paragraph = row['Text'], ind
            answer['other_info'] = [other_info]
            answer['start_index_paragraph'] = int(start_index_paragraph)
            answers[i] = answer
        return self.filter_answer(answers)