Spaces:

muryshev
/

generic-chatbot-backend

Runtime error

App Files Files

xet

Community

generic-chatbot-backend / components /elastic /create_index_elastic_abbreviation.py

muryshev

init

57cf043 7 months ago

raw

history blame

2.67 kB

	import logging

	import pandas as pd
	from elasticsearch import Elasticsearch
	from tqdm import tqdm


	def create_index_elastic_abbreviation(
	df: pd.DataFrame,
	logger: logging.Logger \| None,
	):
	if logger is None:
	logger = logging.getLogger(__name__)

	# Подключение к Elasticsearch
	es = Elasticsearch(hosts='localhost:9200')

	INDEX_NAME = 'nmd_abbreviation_elastic'

	# Удаление старого индекса, если он существует
	if es.indices.exists(index=INDEX_NAME):
	es.indices.delete(index=INDEX_NAME)

	mapping = {
	"mappings": {
	"properties": {
	"abbreviation": {"type": "text", "analyzer": "russian"},
	"text": {"type": "text", "analyzer": "russian"},
	}
	}
	}

	# Создание индекса с указанным маппингом
	es.indices.create(index=INDEX_NAME, body=mapping)

	# Индексация документов
	for ind, row in tqdm(df.iterrows()):
	document = {'abbreviation': row['name'], 'text': row['definition']}

	# Индексирование документа в Elasticsearch
	es.index(index=INDEX_NAME, id=ind, body=document)

	if es.indices.exists(index=INDEX_NAME):
	logger.info(f"Index '{INDEX_NAME}' exists.")

	# # Подсчет количества документов в индексе
	count_response = es.count(index=INDEX_NAME)
	logger.info(f"Total documents in '{INDEX_NAME}': {count_response['count']}")

	# Поиск документов, где поле "person_full_name" содержит определенное значение "Александров Д.В."
	query = {
	"query": {
	"multi_match": {
	"query": "для нужен стандарт управления бизнес процессами компании?",
	"fuzziness": "AUTO",
	"minimum_should_match": "83%",
	"fields": ["text"],
	}
	},
	"highlight": {"fields": {"text": {}}},
	}

	# Выполнение поиска в Elasticsearch
	response = es.search(index=INDEX_NAME, body=query, size=1)
	logger.info(f"Number of hits: {response['hits']['total']['value']}")

	# Вывод результата поиска
	for hit in response['hits']['hits']:
	logger.info(hit)
	logger.info('=====')


	if __name__ == '__main__':
	    # Configure logging so the INFO-level progress messages are actually
	    # emitted; the root logger's default level (WARNING) would drop them.
	    logging.basicConfig(level=logging.INFO)

	    # Read the CSV file with the abbreviation data
	    df = pd.read_csv('/mnt/ntr_work/project/nmd800/data/abbreviations.csv')

	    # Pass the logger explicitly so the call matches the two-parameter
	    # signature (df, logger) of the indexing function.
	    create_index_elastic_abbreviation(df, logging.getLogger(__name__))