Spaces:

RockMi
/

onit-text-analysis

Sleeping

onit-text-analysis / src /indexing /index_data.py

Michela

Upload data and app

e62e0c5 5 months ago

2.63 kB

	"""
	This script creates a Marqo index of preprocessed and original OCR texts. Each page is indexed as one document; its text is split into chunks of two sentences, and each chunk is embedded as a vector.
	The model used for sentence embedding is https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base.

	Code by Michela Vignoli. Parts of this code were developed with assistance from Simon König.
	"""

	import csv
	import os
	from pprint import pprint

	import marqo as mq

	##
	## Connect to Marqo
	##

	# Address of the Marqo server. It can be overridden with the MARQO_URL
	# environment variable so the script is not tied to a single host; when the
	# variable is unset the original address is used.
	MARQO_URL = os.environ.get("MARQO_URL", "http://10.103.251.104:8882")
	# Client object used for every index operation further down.
	marqoClient = mq.Client(url=MARQO_URL)

	##
	## Index settings
	##

	# Text-splitting configuration passed to Marqo when the index is created:
	# text is chunked by sentence, two sentences per chunk, with no overlap
	# between neighbouring chunks.
	settings = dict(
		textPreprocessing=dict(
			splitLength=2,
			splitOverlap=0,
			splitMethod="sentence",
		),
	)

	##
	## Ask if index exists, if not create it
	##

	# Name of the index this script writes to.
	indexName = "onit-sonnini-DHd2025-clean"
	print("Indexname: ", indexName)
	# Names of all indexes currently known to the server.
	current_indexes = [d["indexName"] for d in marqoClient.get_indexes()["results"]]
	if indexName in current_indexes:
		# Nothing to create: later calls address the index by name.
		# NOTE(review): an index that already exists keeps whatever model and
		# settings it was created with; nothing here verifies them.
		print(f"Index already exists: {indexName} ")
		print(f"Defaulting to index connection. Index connected: {indexName} ")
	else:  # Create a new index
		print(f"Index does not exist: {indexName} ")
		print(f"Creating index: {indexName} ")
		# The embedding model is placed inside settings_dict instead of being
		# passed as a separate `model=` keyword: the Marqo client documents
		# settings_dict as overriding all other setting parameters, so a model
		# given next to it would not be applied to the new index.
		# A "model" entry already present in `settings` takes precedence.
		marqoClient.create_index(
			indexName,
			settings_dict={
				"model": "flax-sentence-embeddings/all_datasets_v4_mpnet-base",
				**settings,
			},
		)

	## List of models integrated in Marqo: https://docs.marqo.ai/latest/models/marqo/list-of-models/

	# Show the indexes on the server after the check/creation step.
	pprint(marqoClient.get_indexes())

	##
	## Load dict of data
	##


	# Location of the CSV export that holds the documents to index.
	# Each row becomes one dictionary keyed by the CSV header; the steps below
	# read the text_orig, text_clean and text_prep columns.
	csv_file = 'data/DHd_index-cleaned.csv'

	# Materialise every CSV row as a dictionary.
	with open(csv_file, mode='r', encoding='utf-8') as file:
		animal_descriptions = list(csv.DictReader(file))

	# Function to clean text by replacing \n with spaces
	def clean_text(text):
		"""Return *text* with every newline turned into a space and the ends stripped."""
		single_line = ' '.join(text.split('\n'))
		return single_line.strip()

	# Flatten newlines in the three text columns of every row, in place.
	for entry in animal_descriptions:
		for field in ('text_orig', 'text_clean', 'text_prep'):
			entry[field] = clean_text(entry[field])

	# Show the first three cleaned rows as a quick sanity check.
	pprint(animal_descriptions[:3])

	##
	## Add documents to the index
	##

	print("Indexing data...")
	# Number of documents the client sends per request.
	client_batch_size = 128

	# Upload all rows; only the text_clean field is listed as a tensor field,
	# i.e. the field Marqo creates embeddings for.
	batch_responses = marqoClient.index(indexName).add_documents(
		animal_descriptions,
		client_batch_size=client_batch_size,
		tensor_fields=["text_clean"],
	)

	# With client_batch_size set, the client is documented to return one
	# response per batch; a single response dict is normalised to a list so
	# both shapes are handled the same way.
	if isinstance(batch_responses, dict):
		batch_responses = [batch_responses]
	batch_responses = batch_responses or []

	# Each response is expected to carry an "errors" flag plus per-document
	# "items" with a "status" code (see the Marqo add_documents reference).
	# Collect the failures instead of reporting success unconditionally.
	failed_batches = [r for r in batch_responses if r.get("errors")]
	if failed_batches:
		failed_items = []
		for response in failed_batches:
			for item in response.get("items", []):
				if not 200 <= item.get("status", 200) < 300:
					failed_items.append(item)
		print(
			f"Indexing in {indexName} finished with errors in "
			f"{len(failed_batches)} of {len(batch_responses)} batches "
			f"({len(failed_items)} documents rejected). First failures:"
		)
		pprint(failed_items[:10])
	else:
		print(f"Data has been indexed in {indexName}")