"""Create a Marqo index of preprocessed and original OCR texts.

Each page is indexed as a document that is split into 2-sentence-long
vectors. The model used for sentence embedding is
https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base.

Code by Michela Vignoli. Parts of this code were developed with
assistance from Simon König.
"""
from pprint import pprint
import csv

import marqo as mq

##
## Connect to Marqo
##
MARQO_URL = "http://10.103.251.104:8882"
marqoClient = mq.Client(url=MARQO_URL)
# pprint(marqoClient.get_indexes())

##
## Index settings
##
settings = {
    "textPreprocessing": {
        "splitLength": 2,       # each vector covers 2 sentences
        "splitOverlap": 0,      # no sentence overlap between chunks
        "splitMethod": "sentence",
    },
}

##
## Ask if index exists, if not create it
##
indexName = "onit-sonnini-DHd2025-clean"
print("Indexname: ", indexName)

current_indexes = [d["indexName"] for d in marqoClient.get_indexes()["results"]]
if indexName in current_indexes:
    print(f"Index already exists: {indexName} ")
    # Reuse the existing index instead of recreating it.
    # BUGFIX: the original message string contained a raw (unescaped) newline
    # inside the f-string literal, which is a SyntaxError in Python. It is
    # now emitted as one valid string.
    print(f"Defaulting to index connection. Index connected: {indexName} ")
else:
    # Create a new index
    print(f"Index does not exist: {indexName} ")
    print(f"Creating index: {indexName} ")
    marqoClient.create_index(
        indexName,
        model="flax-sentence-embeddings/all_datasets_v4_mpnet-base",
        settings_dict=settings,
    )
    ## List of models integrated in Marqo:
    ## https://docs.marqo.ai/latest/models/marqo/list-of-models/

pprint(marqoClient.get_indexes())

##
## Load dict of data
##
# Load a list of dictionaries; per the cleaning loop below, each row is
# expected to contain at least the keys: text_orig, text_clean, text_prep
# (plus barcode and page per the original description — TODO confirm against
# the CSV header).
csv_file = 'data/DHd_index-cleaned.csv'

# Read data from CSV file into a list of dictionaries
with open(csv_file, mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    animal_descriptions = [row for row in reader]


def clean_text(text):
    """Return *text* with newlines replaced by spaces and leading/trailing whitespace stripped."""
    return text.replace('\n', ' ').strip()


# Clean every text field in each record so sentence splitting is not
# confused by hard line breaks left over from OCR.
for entry in animal_descriptions:
    for key in ('text_orig', 'text_clean', 'text_prep'):
        entry[key] = clean_text(entry[key])

pprint(animal_descriptions[:3])

##
## Add documents to the index
##
print("Indexing data...")

# Number of documents sent to Marqo per request.
client_batch_size = 128

# Only "text_clean" is embedded as a tensor field; the remaining CSV
# columns are stored as plain (filterable/retrievable) metadata.
marqoClient.index(indexName).add_documents(
    animal_descriptions,
    client_batch_size=client_batch_size,
    tensor_fields=["text_clean"],
)

print(f"Data has been indexed in {indexName}")