# Source: generic-chatbot-backend/components/elastic/create_index_elastic.py
# Repository page metadata: muryshev, commit "init" (57cf043), 12.6 kB
import json
import logging
import sys
import time
from pathlib import Path
from elasticsearch import Elasticsearch
from tqdm import tqdm
# Directory three levels above this file; appended to sys.path so that
# imports relative to that directory resolve when this file is run directly.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
# sys.path holds strings, so the membership test must use the string form:
# comparing the Path object itself never matches and would append a duplicate
# entry on every import/reload.
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))
def create_index_elastic_people(
    path: str,
    logger: logging.Logger | None = None,
) -> None:
    """Rebuild the ``people_search`` Elasticsearch index from JSON files.

    The function connects to Elasticsearch at ``localhost:9200``, deletes the
    ``people_search`` index if it already exists, recreates it with a custom
    analyzer and nested mappings, indexes every entry of ``path`` as one JSON
    document, prints the resulting document count and finally runs one
    hard-coded sample search whose hits are written to ``logger``.

    Args:
        path: Directory whose entries are JSON files, one document per file.
            Entries that cannot be read, parsed or indexed are logged and
            skipped.
        logger: Logger for the error and search output. Defaults to the
            module logger.

    Raises:
        OSError: If ``path`` cannot be listed (e.g. it does not exist).
        Exception: Errors raised by the Elasticsearch client outside the
            per-file indexing step (connection, index creation, search) are
            not caught here.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # Connect to Elasticsearch.
    es = Elasticsearch(hosts='localhost:9200')
    INDEX_NAME = 'people_search'

    # Drop the old index, if present, so it is always rebuilt from scratch.
    if es.indices.exists(index=INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)

    def analyzed_text() -> dict:
        """Return a fresh text-field mapping that uses ``custom_analyzer``."""
        return {
            "type": "text",
            "analyzer": "custom_analyzer",
            "search_analyzer": "custom_analyzer",
        }

    mapping = {
        "settings": {
            "analysis": {
                "char_filter": {
                    # Strip straight and angle quotation marks before tokenizing.
                    "quote_removal": {
                        "type": "pattern_replace",
                        "pattern": "[\"«»]",
                        "replacement": "",
                    }
                },
                "filter": {
                    # "russian_stemmer": {
                    #     "type": "stemmer",
                    #     "name": "russian"
                    # },
                    "custom_stopwords": {
                        "type": "stop",
                        # This filter runs after `lowercase` in the analyzer
                        # chain, so without case-insensitive matching the
                        # upper-case entries below ("ООО", "ПАО", ...) could
                        # never match a token.
                        "ignore_case": True,
                        # NOTE(review): the multi-word entries ("за что",
                        # "Норильский никель", "Рабочей группы") are compared
                        # against single tokens from the standard tokenizer,
                        # so they presumably never match - confirm whether
                        # they should be split into separate words.
                        "stopwords": [
                            "кто",
                            "является",
                            "куратором",
                            "руководит",
                            "отвечает",
                            "бизнес",
                            "за что",
                            "ООО",
                            "ОАО",
                            "НН",
                            "персональный",
                            "состав",
                            "Комитета",
                            "ПАО",
                            "ГМК",
                            "Норильский никель",
                            "Рабочей группы",
                            "что",
                            "как",
                            "почему",
                            "зачем",
                            "где",
                            "когда",
                        ],
                    }
                },
                "analyzer": {
                    "custom_analyzer": {
                        "type": "custom",
                        "char_filter": ["quote_removal"],
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "custom_stopwords",
                            # "russian_stemmer"
                        ],
                    }
                },
            }
        },
        "mappings": {
            "properties": {
                "business_processes": {
                    "type": "nested",
                    "properties": {
                        "production_activities_section": analyzed_text(),
                        "processes_name": analyzed_text(),
                        "level_process": analyzed_text(),
                    },
                },
                # The field name's spelling is kept as-is: it is a runtime key
                # that the search query below also uses.
                "organizatinal_structure": {
                    "type": "nested",
                    "properties": {
                        "position": analyzed_text(),
                        "leads": {
                            "type": "nested",
                            "properties": {
                                "0": analyzed_text(),
                                "1": analyzed_text(),
                            },
                        },
                        "subordinate": {
                            "type": "object",
                            "properties": {
                                "person_name": analyzed_text(),
                                "position": analyzed_text(),
                            },
                        },
                    },
                },
                "business_curator": {
                    "type": "nested",
                    "properties": {
                        "division": analyzed_text(),
                        "company_name": analyzed_text(),
                    },
                },
                "groups": {
                    "type": "nested",
                    "properties": {
                        "group_name": analyzed_text(),
                        "position_in_group": analyzed_text(),
                        "block": {"type": "keyword", "null_value": "unknown"},
                    },
                },
                "person_name": analyzed_text(),
            }
        },
    }

    # Create the index with the mapping defined above.
    es.indices.create(index=INDEX_NAME, body=mapping)

    # Sorting makes document ids deterministic across runs and lets tqdm show
    # a total. The loop variable is deliberately not called `path`, so it does
    # not shadow the function parameter.
    entries = sorted(Path(path).iterdir())
    for doc_id, file_path in enumerate(tqdm(entries), start=1):
        try:
            # Read and parse the JSON document.
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Index the document in Elasticsearch.
            es.index(index=INDEX_NAME, id=doc_id, body=data)
            time.sleep(0.5)
        except Exception:
            # Best effort: one bad entry must not abort the whole import, but
            # the cause is logged with its traceback. KeyboardInterrupt and
            # SystemExit are intentionally not caught.
            logger.exception(
                "Ошибка при чтении или добавлении файла %s в индекс",
                file_path.name,
            )

    # Make the documents indexed above visible to the count and search below.
    es.indices.refresh(index=INDEX_NAME)

    if es.indices.exists(index=INDEX_NAME):
        print(f"Index '{INDEX_NAME}' exists.")

    # Count the documents in the index.
    count_response = es.count(index=INDEX_NAME)
    print(f"Total documents in '{INDEX_NAME}': {count_response['count']}")

    def get_elastic_people_query(query):
        """Build the search body that matches ``query`` against people fields.

        The company-name clause of ``business_curator`` gets a boost of 20
        when the query text mentions a business curator, otherwise 15.
        """
        lowered = query.lower()
        has_business_curator = (
            "бизнес куратор" in lowered or "бизнес-куратор" in lowered
        )
        business_curator_boost = 20 if has_business_curator else 15
        return {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {
                                    "multi_match": {
                                        "query": query,
                                        "fields": ["person_name^3"],
                                        "fuzziness": "AUTO",
                                        "analyzer": "custom_analyzer",
                                    }
                                },
                                {
                                    "nested": {
                                        "path": "business_processes",
                                        "query": {
                                            "multi_match": {
                                                "query": query,
                                                "fields": [
                                                    "business_processes.production_activities_section",
                                                    "business_processes.processes_name",
                                                ],
                                                "fuzziness": "AUTO",
                                                "analyzer": "custom_analyzer",
                                            }
                                        },
                                    }
                                },
                                {
                                    "nested": {
                                        "path": "organizatinal_structure",
                                        "query": {
                                            "multi_match": {
                                                "query": query,
                                                "fields": [
                                                    "organizatinal_structure.position^2"
                                                ],
                                                "fuzziness": "AUTO",
                                                "analyzer": "custom_analyzer",
                                            }
                                        },
                                    }
                                },
                                {
                                    "nested": {
                                        "path": "business_curator",
                                        "query": {
                                            "multi_match": {
                                                "query": query,
                                                "fields": [
                                                    f"business_curator.company_name^{business_curator_boost}"
                                                ],
                                                "fuzziness": "AUTO",
                                                "analyzer": "custom_analyzer",
                                            }
                                        },
                                    }
                                },
                            ]
                        }
                    }
                }
            }
        }

    # Smoke test: run one sample search and log the top hits.
    query = 'кто бизнес куратор ООО Медвежий ручей?'
    response = es.search(index=INDEX_NAME, body=get_elastic_people_query(query), size=2)
    logger.info("Number of hits: %s", response['hits']['total']['value'])

    # Log the source document of each hit.
    for hit in response['hits']['hits']:
        logger.info(hit['_source'])
if __name__ == '__main__':
    # Directory of JSON documents to index. An optional first command-line
    # argument overrides the built-in default location, so the script is not
    # tied to one machine's filesystem layout.
    path = sys.argv[1] if len(sys.argv) > 1 else '/mnt/ntr_work/data/фывфыаыфвфы/person_card'
    create_index_elastic_people(path)