Spaces:
Sleeping
Sleeping
import json | |
import logging | |
import sys | |
import time | |
from pathlib import Path | |
from elasticsearch import Elasticsearch | |
from tqdm import tqdm | |
# Directory three levels above this file (presumably the project root), made
# importable so sibling packages can be resolved when run as a script.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
# sys.path holds strings, so the membership test must use the string form;
# testing the Path object itself never matches and would append a duplicate
# entry on every import/reload.
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))
def create_index_elastic_people( | |
path: str, | |
logger: logging.Logger | None = None, | |
): | |
if logger is None: | |
logger = logging.getLogger(__name__) | |
# Подключение к Elasticsearch | |
es = Elasticsearch(hosts='localhost:9200') | |
INDEX_NAME = 'people_search' | |
# Удаление старого индекса, если он существует | |
if es.indices.exists(index=INDEX_NAME): | |
es.indices.delete(index=INDEX_NAME) | |
mapping = { | |
"settings": { | |
"analysis": { | |
"char_filter": { | |
"quote_removal": { | |
"type": "pattern_replace", | |
"pattern": "[\"«»]", | |
"replacement": "", | |
} | |
}, | |
"filter": { | |
# "russian_stemmer": { | |
# "type": "stemmer", | |
# "name": "russian" | |
# }, | |
"custom_stopwords": { | |
"type": "stop", | |
"stopwords": [ | |
"кто", | |
"является", | |
"куратором", | |
"руководит", | |
"отвечает", | |
"бизнес", | |
"за что", | |
"ООО", | |
"ОАО", | |
"НН", | |
"персональный", | |
"состав", | |
"персональный", | |
"состав", | |
"Комитета", | |
"ПАО", | |
"ГМК", | |
"Норильский никель", | |
"Рабочей группы", | |
"что", | |
"как", | |
"почему", | |
"зачем", | |
"где", | |
"когда", | |
], | |
} | |
}, | |
"analyzer": { | |
"custom_analyzer": { | |
"type": "custom", | |
"char_filter": ["quote_removal"], | |
"tokenizer": "standard", | |
"filter": [ | |
"lowercase", | |
"custom_stopwords", | |
# "russian_stemmer" | |
], | |
} | |
}, | |
} | |
}, | |
"mappings": { | |
"properties": { | |
"business_processes": { | |
"type": "nested", | |
"properties": { | |
"production_activities_section": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"processes_name": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"level_process": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
}, | |
}, | |
"organizatinal_structure": { | |
"type": "nested", | |
"properties": { | |
"position": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"leads": { | |
"type": "nested", | |
"properties": { | |
"0": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"1": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
}, | |
}, | |
"subordinate": { | |
"type": "object", | |
"properties": { | |
"person_name": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"position": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
}, | |
}, | |
}, | |
}, | |
"business_curator": { | |
"type": "nested", | |
"properties": { | |
"division": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"company_name": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
}, | |
}, | |
"groups": { | |
"type": "nested", | |
"properties": { | |
"group_name": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"position_in_group": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
"block": {"type": "keyword", "null_value": "unknown"}, | |
}, | |
}, | |
"person_name": { | |
"type": "text", | |
"analyzer": "custom_analyzer", | |
"search_analyzer": "custom_analyzer", | |
}, | |
} | |
}, | |
} | |
# Создание индекса с указанным маппингом | |
es.indices.create(index=INDEX_NAME, body=mapping) | |
group_names = [] | |
for ind, path in tqdm(enumerate(Path(path).iterdir())): | |
# Открываем файл и читаем его содержимое | |
try: | |
with open(path, 'r', encoding='utf-8') as file: | |
data = json.load(file) | |
# Индексирование документа в Elasticsearch | |
es.index(index=INDEX_NAME, id=ind + 1, body=data) | |
time.sleep(0.5) | |
except: | |
print(f"Ошибка при чтении или добавлении файла {path.name} в индекс") | |
if es.indices.exists(index=INDEX_NAME): | |
print(f"Index '{INDEX_NAME}' exists.") | |
# Подсчет количества документов в индексе | |
count_response = es.count(index=INDEX_NAME) | |
print(f"Total documents in '{INDEX_NAME}': {count_response['count']}") | |
def get_elastic_people_query(query): | |
has_business_curator = ( | |
"бизнес куратор" in query.lower() or "бизнес-куратор" in query.lower() | |
) | |
business_curator_boost = 20 if has_business_curator else 15 | |
return { | |
"query": { | |
"function_score": { | |
"query": { | |
"bool": { | |
"should": [ | |
{ | |
"multi_match": { | |
"query": query, | |
"fields": ["person_name^3"], | |
"fuzziness": "AUTO", | |
"analyzer": "custom_analyzer", | |
} | |
}, | |
{ | |
"nested": { | |
"path": "business_processes", | |
"query": { | |
"multi_match": { | |
"query": query, | |
"fields": [ | |
"business_processes.production_activities_section", | |
"business_processes.processes_name", | |
], | |
"fuzziness": "AUTO", | |
"analyzer": "custom_analyzer", | |
} | |
}, | |
} | |
}, | |
{ | |
"nested": { | |
"path": "organizatinal_structure", | |
"query": { | |
"multi_match": { | |
"query": query, | |
"fields": [ | |
"organizatinal_structure.position^2" | |
], | |
"fuzziness": "AUTO", | |
"analyzer": "custom_analyzer", | |
} | |
}, | |
} | |
}, | |
{ | |
"nested": { | |
"path": "business_curator", | |
"query": { | |
"multi_match": { | |
"query": query, | |
"fields": [ | |
f"business_curator.company_name^{business_curator_boost}" | |
], | |
"fuzziness": "AUTO", | |
"analyzer": "custom_analyzer", | |
} | |
}, | |
} | |
}, | |
] | |
} | |
} | |
} | |
} | |
} | |
query = 'кто бизнес куратор ООО Медвежий ручей?' | |
# Выполнение поиска в Elasticsearch | |
response = es.search(index=INDEX_NAME, body=get_elastic_people_query(query), size=2) | |
logger.info(f"Number of hits: {response['hits']['total']['value']}") | |
# Вывод результата поиска | |
for hit in response['hits']['hits']: | |
logger.info(hit['_source']) | |
if __name__ == '__main__':
    # Script entry point: rebuild the index from the person-card directory.
    person_cards_dir = '/mnt/ntr_work/data/фывфыаыфвфы/person_card'
    create_index_elastic_people(person_cards_dir)