"""Build and smoke-test the ``people_search`` Elasticsearch index.

The module recreates the index from a directory of JSON person cards and
exposes :func:`get_elastic_people_query` for building the search request body.
"""

import json
import logging
import sys
import time
from pathlib import Path

from elasticsearch import Elasticsearch
from tqdm import tqdm

ROOT_DIR = Path(__file__).resolve().parent.parent.parent
# sys.path holds strings, so the string form must be compared; testing the
# Path object itself never matches and appends a duplicate on every import.
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))

_INDEX_NAME = 'people_search'


def _text_field() -> dict:
    """Return a fresh ``text`` field mapping analysed by ``custom_analyzer``."""
    return {
        "type": "text",
        "analyzer": "custom_analyzer",
        "search_analyzer": "custom_analyzer",
    }


def _build_index_body() -> dict:
    """Return the settings and mappings used to create the people index."""
    return {
        "settings": {
            "analysis": {
                "char_filter": {
                    # Strip straight and guillemet quotes before tokenizing.
                    "quote_removal": {
                        "type": "pattern_replace",
                        "pattern": "[\"«»]",
                        "replacement": "",
                    }
                },
                "filter": {
                    # NOTE: a "russian" stemmer filter was sketched here in an
                    # earlier revision but is not enabled.
                    "custom_stopwords": {
                        "type": "stop",
                        # The analyzer lowercases tokens before this filter
                        # runs, so without ignore_case the upper-case entries
                        # below ("ООО", "ПАО", ...) could never match.
                        "ignore_case": True,
                        # NOTE(review): multi-word entries ("за что",
                        # "Норильский никель", "Рабочей группы") are compared
                        # against single tokens from the standard tokenizer
                        # and therefore presumably never match - confirm
                        # whether they should be split into separate words.
                        "stopwords": [
                            "кто",
                            "является",
                            "куратором",
                            "руководит",
                            "отвечает",
                            "бизнес",
                            "за что",
                            "ООО",
                            "ОАО",
                            "НН",
                            "персональный",
                            "состав",
                            "Комитета",
                            "ПАО",
                            "ГМК",
                            "Норильский никель",
                            "Рабочей группы",
                            "что",
                            "как",
                            "почему",
                            "зачем",
                            "где",
                            "когда",
                        ],
                    }
                },
                "analyzer": {
                    "custom_analyzer": {
                        "type": "custom",
                        "char_filter": ["quote_removal"],
                        "tokenizer": "standard",
                        "filter": ["lowercase", "custom_stopwords"],
                    }
                },
            }
        },
        "mappings": {
            "properties": {
                "business_processes": {
                    "type": "nested",
                    "properties": {
                        "production_activities_section": _text_field(),
                        "processes_name": _text_field(),
                        "level_process": _text_field(),
                    },
                },
                # The misspelled key is kept as-is: it is the field name the
                # documents and the search query use.
                "organizatinal_structure": {
                    "type": "nested",
                    "properties": {
                        "position": _text_field(),
                        "leads": {
                            "type": "nested",
                            "properties": {
                                "0": _text_field(),
                                "1": _text_field(),
                            },
                        },
                        "subordinate": {
                            "type": "object",
                            "properties": {
                                "person_name": _text_field(),
                                "position": _text_field(),
                            },
                        },
                    },
                },
                "business_curator": {
                    "type": "nested",
                    "properties": {
                        "division": _text_field(),
                        "company_name": _text_field(),
                    },
                },
                "groups": {
                    "type": "nested",
                    "properties": {
                        "group_name": _text_field(),
                        "position_in_group": _text_field(),
                        "block": {"type": "keyword", "null_value": "unknown"},
                    },
                },
                "person_name": _text_field(),
            }
        },
    }


def get_elastic_people_query(query: str) -> dict:
    """Build the search request body for a free-text people query.

    The query is matched (fuzzily, through ``custom_analyzer``) against the
    person name, business-process names, organisational position and the
    business curator's company name. The company-name boost is 20 when the
    query mentions "бизнес куратор" / "бизнес-куратор" and 15 otherwise.

    Args:
        query: Raw user query text.

    Returns:
        A dict suitable for the ``body`` argument of ``Elasticsearch.search``.
    """
    lowered = query.lower()
    mentions_curator = (
        "бизнес куратор" in lowered or "бизнес-куратор" in lowered
    )
    business_curator_boost = 20 if mentions_curator else 15

    def fuzzy_match(fields: list) -> dict:
        # One multi_match clause; only the field list differs between clauses.
        return {
            "multi_match": {
                "query": query,
                "fields": fields,
                "fuzziness": "AUTO",
                "analyzer": "custom_analyzer",
            }
        }

    def nested_match(nested_path: str, fields: list) -> dict:
        # Nested fields must be queried through a `nested` wrapper.
        return {"nested": {"path": nested_path, "query": fuzzy_match(fields)}}

    should_clauses = [
        fuzzy_match(["person_name^3"]),
        nested_match(
            "business_processes",
            [
                "business_processes.production_activities_section",
                "business_processes.processes_name",
            ],
        ),
        nested_match(
            "organizatinal_structure",
            ["organizatinal_structure.position^2"],
        ),
        nested_match(
            "business_curator",
            [f"business_curator.company_name^{business_curator_boost}"],
        ),
    ]
    return {
        "query": {
            "function_score": {
                "query": {"bool": {"should": should_clauses}},
            }
        }
    }


def create_index_elastic_people(
    path: str,
    logger: logging.Logger | None = None,
    index_delay: float = 0.5,
) -> None:
    """Recreate the people index from a directory of JSON files.

    Any existing ``people_search`` index on ``localhost:9200`` is deleted and
    created again, every entry of ``path`` is loaded as JSON and indexed, and
    finally a sample query is run and its hits are logged.

    Args:
        path: Directory whose entries are JSON documents to index.
        logger: Logger for progress and errors; defaults to this module's
            logger.
        index_delay: Seconds to pause after each successfully indexed
            document.

    Raises:
        OSError: If ``path`` cannot be listed. This is raised before the
            existing index is touched.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # List the input first so a bad path fails before the old index is
    # dropped. Sorting makes the assigned document ids reproducible and
    # gives tqdm a known total.
    source_files = sorted(Path(path).iterdir())

    # Connect to Elasticsearch.
    es = Elasticsearch(hosts='localhost:9200')

    # Drop the old index, if any, and create it with the mapping.
    if es.indices.exists(index=_INDEX_NAME):
        es.indices.delete(index=_INDEX_NAME)
    es.indices.create(index=_INDEX_NAME, body=_build_index_body())

    for doc_id, file_path in enumerate(tqdm(source_files), start=1):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            es.index(index=_INDEX_NAME, id=doc_id, body=data)
        except Exception:
            # Best effort: one bad file must not abort the whole run.
            # `Exception` (not a bare except) keeps Ctrl+C working.
            logger.exception(
                "Ошибка при чтении или добавлении файла %s в индекс",
                file_path.name,
            )
            continue
        time.sleep(index_delay)

    if es.indices.exists(index=_INDEX_NAME):
        logger.info("Index '%s' exists.", _INDEX_NAME)
        # Make freshly indexed documents visible to count and search.
        es.indices.refresh(index=_INDEX_NAME)
        count_response = es.count(index=_INDEX_NAME)
        logger.info(
            "Total documents in '%s': %s",
            _INDEX_NAME,
            count_response['count'],
        )

    # Smoke test: run a sample query and log the top hits.
    query = 'кто бизнес куратор ООО Медвежий ручей?'
    response = es.search(
        index=_INDEX_NAME,
        body=get_elastic_people_query(query),
        size=2,
    )
    logger.info("Number of hits: %s", response['hits']['total']['value'])
    for hit in response['hits']['hits']:
        logger.info("%s", hit['_source'])


if __name__ == '__main__':
    # Without a handler, INFO-level output would be dropped in script mode.
    logging.basicConfig(level=logging.INFO)
    # An optional first CLI argument overrides the default data directory.
    if len(sys.argv) > 1:
        path = sys.argv[1]
    else:
        path = '/mnt/ntr_work/data/фывфыаыфвфы/person_card'
    create_index_elastic_people(path)