Spaces:

muryshev
/

generic-chatbot-backend

Sleeping

App Files Files Community

generic-chatbot-backend / components /elastic /create_index_elastic.py

muryshev

init

57cf043 4 months ago

raw

history blame

12.6 kB

	import json
	import logging
	import sys
	import time
	from pathlib import Path

	from elasticsearch import Elasticsearch
	from tqdm import tqdm

	# Third ancestor directory of this file; appended to sys.path below
	# (presumably so project-level packages can be imported — confirm).
	ROOT_DIR = Path(__file__).resolve().parent.parent.parent
	# sys.path holds strings, so the membership test must use the string form.
	# Comparing the Path object itself never matches and would append a
	# duplicate entry every time this module is imported or reloaded.
	if str(ROOT_DIR) not in sys.path:
	    sys.path.append(str(ROOT_DIR))


	def create_index_elastic_people(
	path: str,
	logger: logging.Logger \| None = None,
	):
	if logger is None:
	logger = logging.getLogger(__name__)

	# Подключение к Elasticsearch
	es = Elasticsearch(hosts='localhost:9200')
	INDEX_NAME = 'people_search'

	# Удаление старого индекса, если он существует
	if es.indices.exists(index=INDEX_NAME):
	es.indices.delete(index=INDEX_NAME)

	mapping = {
	"settings": {
	"analysis": {
	"char_filter": {
	"quote_removal": {
	"type": "pattern_replace",
	"pattern": "[\"«»]",
	"replacement": "",
	}
	},
	"filter": {
	# "russian_stemmer": {
	# "type": "stemmer",
	# "name": "russian"
	# },
	"custom_stopwords": {
	"type": "stop",
	"stopwords": [
	"кто",
	"является",
	"куратором",
	"руководит",
	"отвечает",
	"бизнес",
	"за что",
	"ООО",
	"ОАО",
	"НН",
	"персональный",
	"состав",
	"персональный",
	"состав",
	"Комитета",
	"ПАО",
	"ГМК",
	"Норильский никель",
	"Рабочей группы",
	"что",
	"как",
	"почему",
	"зачем",
	"где",
	"когда",
	],
	}
	},
	"analyzer": {
	"custom_analyzer": {
	"type": "custom",
	"char_filter": ["quote_removal"],
	"tokenizer": "standard",
	"filter": [
	"lowercase",
	"custom_stopwords",
	# "russian_stemmer"
	],
	}
	},
	}
	},
	"mappings": {
	"properties": {
	"business_processes": {
	"type": "nested",
	"properties": {
	"production_activities_section": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"processes_name": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"level_process": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	},
	},
	"organizatinal_structure": {
	"type": "nested",
	"properties": {
	"position": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"leads": {
	"type": "nested",
	"properties": {
	"0": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"1": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	},
	},
	"subordinate": {
	"type": "object",
	"properties": {
	"person_name": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"position": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	},
	},
	},
	},
	"business_curator": {
	"type": "nested",
	"properties": {
	"division": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"company_name": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	},
	},
	"groups": {
	"type": "nested",
	"properties": {
	"group_name": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"position_in_group": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	"block": {"type": "keyword", "null_value": "unknown"},
	},
	},
	"person_name": {
	"type": "text",
	"analyzer": "custom_analyzer",
	"search_analyzer": "custom_analyzer",
	},
	}
	},
	}
	# Создание индекса с указанным маппингом
	es.indices.create(index=INDEX_NAME, body=mapping)

	group_names = []
	for ind, path in tqdm(enumerate(Path(path).iterdir())):
	# Открываем файл и читаем его содержимое
	try:
	with open(path, 'r', encoding='utf-8') as file:
	data = json.load(file)

	# Индексирование документа в Elasticsearch
	es.index(index=INDEX_NAME, id=ind + 1, body=data)
	time.sleep(0.5)
	except:
	print(f"Ошибка при чтении или добавлении файла {path.name} в индекс")

	if es.indices.exists(index=INDEX_NAME):
	print(f"Index '{INDEX_NAME}' exists.")

	# Подсчет количества документов в индексе
	count_response = es.count(index=INDEX_NAME)
	print(f"Total documents in '{INDEX_NAME}': {count_response['count']}")

	def get_elastic_people_query(query):
	has_business_curator = (
	"бизнес куратор" in query.lower() or "бизнес-куратор" in query.lower()
	)
	business_curator_boost = 20 if has_business_curator else 15
	return {
	"query": {
	"function_score": {
	"query": {
	"bool": {
	"should": [
	{
	"multi_match": {
	"query": query,
	"fields": ["person_name^3"],
	"fuzziness": "AUTO",
	"analyzer": "custom_analyzer",
	}
	},
	{
	"nested": {
	"path": "business_processes",
	"query": {
	"multi_match": {
	"query": query,
	"fields": [
	"business_processes.production_activities_section",
	"business_processes.processes_name",
	],
	"fuzziness": "AUTO",
	"analyzer": "custom_analyzer",
	}
	},
	}
	},
	{
	"nested": {
	"path": "organizatinal_structure",
	"query": {
	"multi_match": {
	"query": query,
	"fields": [
	"organizatinal_structure.position^2"
	],
	"fuzziness": "AUTO",
	"analyzer": "custom_analyzer",
	}
	},
	}
	},
	{
	"nested": {
	"path": "business_curator",
	"query": {
	"multi_match": {
	"query": query,
	"fields": [
	f"business_curator.company_name^{business_curator_boost}"
	],
	"fuzziness": "AUTO",
	"analyzer": "custom_analyzer",
	}
	},
	}
	},
	]
	}
	}
	}
	}
	}

	query = 'кто бизнес куратор ООО Медвежий ручей?'
	# Выполнение поиска в Elasticsearch
	response = es.search(index=INDEX_NAME, body=get_elastic_people_query(query), size=2)
	logger.info(f"Number of hits: {response['hits']['total']['value']}")

	# Вывод результата поиска
	for hit in response['hits']['hits']:
	logger.info(hit['_source'])


	if __name__ == '__main__':
	    # The indexer reports its search results at INFO level; without this
	    # they are dropped when the file is run as a script. Only this module's
	    # logger is lowered to INFO so third-party loggers stay at WARNING.
	    logging.basicConfig()
	    logging.getLogger(__name__).setLevel(logging.INFO)

	    # An optional first command-line argument overrides the default
	    # directory of person-card JSON files.
	    default_path = '/mnt/ntr_work/data/фывфыаыфвфы/person_card'
	    path = sys.argv[1] if len(sys.argv) > 1 else default_path
	    create_index_elastic_people(path)