# Analysis of Retrieval Results

## Load annotations and retrieval results

In [1]:
import pandas as pd
import os
import re

text_annotations_path = "data/annotations/"
filename = 'DHd2025_referenceReports_merged-cleaned__annotations.xlsx'

# Read the annotation spreadsheet into a DataFrame.
annotations = pd.read_excel(os.path.join(text_annotations_path, filename))

# Tag substrings whose rows are excluded from the analysis.
unwanted_strings = ['pages', 'subheading', 'footnote', 'table of contents', 'footnote irrelevant', 'title', 'scan mistake', 'dictionary']

# The substrings are OR-ed into a single pattern and searched case-insensitively in 'TAGS'.
# Rows with a missing 'TAGS' value count as "no unwanted tag" and are therefore kept.
unwanted_pattern = '|'.join(unwanted_strings)
has_unwanted_tag = annotations['TAGS'].str.contains(unwanted_pattern, case=False, na=False)
annotations = annotations[~has_unwanted_tag].reset_index(drop=True)

# Keep only rows whose 'FILE' value contains 'Z166069305' (case-insensitive).
# The index is not reset again after this second filter.
is_selected_file = annotations['FILE'].str.contains('Z166069305', case=False, na=False)
annotations = annotations[is_selected_file]

annotations

Unnamed: 0,UUID,FILE,QUOTE_TRANSCRIPTION,ANCHOR,COMMENTS,TAGS,PAGE,TRANSCRIPTION_CLEANED
551,2faf6dce-7806-48b6-8e8b-2bf926fc88fc,Z166069305_merged.txt,Negypten aber hat nicht allein Wunder der Kunſ...,char-offset:2567,p. 8,'plants vegetation 25G',8,('n? Negypten aber hat nicht allein Wunder der...
552,2e026f81-cbe7-43ba-9b37-5f5bd2760119,Z166069305_merged.txt,Barbaren bewohnen jezt das Land und Räuber beh...,char-offset:3391,p. 8,'plants vegetation 25G',8,('r. Barbaren bewohnen jezt das Land und Raube...
553,0d5708db-1bf6-4de4-ad64-9759ebb4fa3c,Z166069305_merged.txt,Die Waffen eines Mamefu den zu Pferde find zwe...,char-offset:5538,p. 9-10\n,"'animals 25F', 'horses and kindred animals 46C...",10,('n. Die Waffen eines Mamefu den zu Pferde fin...
554,64b5828e-b14d-49dc-8d88-f97de52f9202,Z166069305_merged.txt,Den Zaum reines Pferdes nimmt er zwiſchen die ...,char-offset:5987,p. 10\n,"'animals 25F', 'horses and kindred animals 46C...",10,('l. Den Zaum reines Pferdes nimmt er zwischen...
555,fa2e8b0d-d8aa-46c8-ac97-1a8e2b14600d,Z166069305_merged.txt,"Seine Häuſer ſind arms felige Kothhütten, und ...",char-offset:8769,p. 11-12,'plants vegetation 25G',12,"(' Hauser sind arms felige Kothhutten, und fei..."
...,...,...,...,...,...,...,...,...
1117,3d4a21ad-ddd8-4d3a-b018-d7dbb9f186dc,Z166069305_merged.txt,"Bittt adit2: 001), ont ma Des Abends begaben ſ...",char-offset:821475,,"'animals 25F', 'birds 25F3'",504,"(""00 Bittt adit2: 001), ont ma Des Abends bega..."
1118,2aefbba4-e796-4add-bdb1-0768afe0a3ce,Z166069305_merged.txt,57 3 Chazelle lieb Chazelles,char-offset:823734,,"'animals 25F', 'hoofed animalGAZELLE 25F24'",504,"('ue 57 3 Chazelle lieb Chazelles 65', 814073)"
1119,f50b57c6-d751-4d41-8670-8b5254d964ec,Z166069305_merged.txt,"Ein' to svil, dulu UPUN Fahn irlichin( jiyi vo...",char-offset:824751,,'animals 25F',504,"(""c. Ein' to suil, dulu UPUN Fahn irlichin( ji..."
1120,d8cfcc30-5f00-4681-826b-c5c7a87e768b,Z166069305_merged.txt,"Dez HENNÈI, Blüthen. Eine Blüthe ron natiiitic...",char-offset:824901,,'plants vegetation 25G',504,"('-- Dez HENNEI, Bluthen. Eine Bluthe ron nati..."


In [2]:
## Filter the DataFrame
# Keep only annotations whose 'TAGS' value contains 'horse' (case-sensitive; rows without tags are dropped).
filtered_df = annotations[annotations['TAGS'].str.contains('horse', na=False)]
# Count the number of rows before filtering short sentences
before_filter_count = len(filtered_df)

# Keep only rows whose 'TRANSCRIPTION_CLEANED' text has 5 or more whitespace-separated words.
# The .str accessor propagates missing values as NaN, and NaN >= 5 is False, so rows without a
# cleaned transcription are excluded instead of raising an AttributeError.
# NOTE(review): the displayed example shows 'TRANSCRIPTION_CLEANED' as the string form of a
# (text, number) tuple, so the leading "('" and the trailing number are part of the counted
# words and of 'annotated_text' below -- confirm this is intended.
word_counts = filtered_df['TRANSCRIPTION_CLEANED'].str.split().str.len()
filtered_df = filtered_df[word_counts >= 5]
# Count the number of rows after filtering short sentences
after_filter_count = len(filtered_df)
excluded_count = before_filter_count - after_filter_count
print(f"Number of excluded sentences: {excluded_count}")

# One dict of column values per annotation, keyed by its 'UUID'.
mentions_dict = filtered_df.set_index('UUID').to_dict(orient='index')

# Flatten to a list of records holding only the fields used by the evaluation.
mentions = [{"id": key, "annotated_text": v['TRANSCRIPTION_CLEANED'], "comment": v['COMMENTS'], "page": v['PAGE'], "file": v['FILE']} for key, v in mentions_dict.items()]

print(f"Number of annotated mentions: {len(filtered_df)}")
mentions[0]

Number of excluded sentences: 0
Number of annotated mentions: 48


{'id': '0d5708db-1bf6-4de4-ad64-9759ebb4fa3c',
 'annotated_text': "('n. Die Waffen eines Mamefu den zu Pferde find zwei --- 00010_page10_cleaned.txt --- grose Flinten, die ihm seine Diener zur Seite nach: tragen und die er ein einziges mal losschiest. Da', 5402)",
 'comment': 'p. 9-10\n',
 'page': 10,
 'file': 'Z166069305_merged.txt'}

In [3]:
## Summary of annotations

# Total annotations: keep one entry per annotation "id"
# (a later entry with the same id replaces an earlier one).
unique_annot = dict((mention["id"], mention) for mention in mentions).values()

# Annotated pages: keep one entry per "page" value
# (again the last entry seen for a page is the one that is kept).
unique_pages_annot = dict((mention["page"], mention) for mention in mentions).values()

# Page numbers of the annotated pages, one per unique page
unique_pages_list = [page_entry['page'] for page_entry in unique_pages_annot]

n_annotations = len(unique_annot)
n_annotated_pages = len(unique_pages_annot)
print(f'Total annotations: {n_annotations}')
print(f'On total pages: {n_annotated_pages}')

Total annotations: 48
On total pages: 40


In [5]:
## Load retrieval results with Marqo DHd2025

## Pferd, Pferde
# One result CSV per directory: sonnini_original_OCR, sonnini_llm_corrected, sonnini_cleaned.
retr_orig_path = 'data/retrieval_results/sonnini_original_OCR/i_onit-test-index-sonnini-q_Pferd-Pferde.csv'
retr_prep_path = 'data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv'
retr_clean_path = 'data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv'

retr_orig = pd.read_csv(retr_orig_path)
retr_prep = pd.read_csv(retr_prep_path)
retr_clean = pd.read_csv(retr_clean_path)

## Evaluation of results

In [7]:
## Analyse overlap of retrieved texts with annotated texts

"""
In this notebook, we attempted to compare the heterogeneous texts that were annotated during the project
with the retrieval results by comparing n-word phrases from the annotated sentences with the retrieved
sentence vectors. This approach is experimental. An alternative evaluation by calculating the Levenshtein
distance between the annotated text vectors and the retrieved text vectors will be done in future.
"""

## Create dict from retrieved texts
# Only the first 200 rows of one result set are evaluated. To evaluate another
# result set, use retr_orig.head(200) or retr_prep.head(200) here instead.
retr = retr_clean.head(200)

# Map every '_id' to a dict holding the other column values of that row.
retrieved_dict = retr.set_index('_id').to_dict(orient='index')


def _to_retrieved_entry(doc_id, fields):
    """Build the record used by the overlap check from one retrieved row.

    `doc_id` is the row's '_id' and `fields` the dict of its other columns.
    'text_vector' is the 'unpacked_highlights' value, or the 'text' value
    when the highlights are missing according to `pd.notna`.
    """
    text_document = fields['text_clean']  # 'text', 'text_clean'
    highlights = fields['unpacked_highlights']
    if pd.notna(highlights):
        text_vector = highlights
    else:
        text_vector = fields['text']
    return {
        "id": doc_id,
        "text_document": text_document,
        "text_vector": text_vector,
        # 'page' is taken over as stored; alternative: int((re.search(r'\d+', v['page'])).group())
        "page": fields['page'],
        "rerank": fields['rerank'],
        "barcode": fields['barcode'],
        "onb_link": fields['onb_viewer_link'],
    }


retrieved = [_to_retrieved_entry(key, v) for key, v in retrieved_dict.items()]

print(len(retrieved))
retrieved[0]

200


{'id': '01b71d37-9ead-4eb2-8504-d16cbd00e866',
 'text_document': 'befand mich in einiger Entfernung davon, und der Ueber- rest unserer Begleitung folgte in ziemlicher Weite nach. Ein Trupp Beduinen zu Pferde. brach auf einmal hinter den Mauern hervor. Ich konnte sie an. fanglich unter den Staubwolken, die sie erregten, nicht unterscheiden: als fie fich aber ausgebreitet hatten, erfannte ich sowohl die Ort als die Anzahl von Leuten, mit denen wir zu thun haben sollten. Ich wandte sogleich inein Pferd um, und mit diefem vortreflichen Reuter, auf welchem man mich nicht hatte einbohlen konnen, hatte ich bald meine Gefahrten erreicht, die auch von ihren Came len herab diese. Reuterei gewahr worden waren. Ich fand sie zu Fus und in geschlossene Glieder gestellt. Ich sturzte mich von meinem Pferde herab und munterte fie zu einenu muthigen Widerstande auf. Wir waren im Ganzen rechs Personen, unter welchen wir nur auf drei | rechnen konnten. Zwei Eingebohrne fonnten nicht viel helfen, und der Z

In [8]:
## Function to generate n-word phrases from a sentence
def generate_n_word_phrases(sentence, n):
    """Return every run of `n` consecutive words of `sentence`, lower-cased.

    The sentence is lower-cased and split on whitespace; each phrase is one
    window of `n` words joined by single spaces, in order of appearance.

    Returns an empty list when the sentence has fewer than `n` words, and
    also when `n` is smaller than 1: a zero-length window would otherwise
    produce empty strings, which are substrings of every text and would make
    any containment check succeed.
    """
    if n < 1:
        return []
    words = sentence.lower().split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

## Function to check if any of the n-word phrases is in the text_vector
## and store the first matching phrase
def check_sentence_in_text_vector(sentence, text_vector, n):
    """Report whether `text_vector` contains an n-word phrase of `sentence`.

    The phrases come from `generate_n_word_phrases(sentence, n)` and are
    looked up as plain substrings in the lower-cased `text_vector`.

    Returns `(True, phrase)` with the first phrase that occurs in the text,
    or `(False, None)` when none of them does.
    """
    haystack = text_vector.lower()
    first_hit = next(
        (candidate for candidate in generate_n_word_phrases(sentence, n) if candidate in haystack),
        None,
    )
    if first_hit is None:
        return False, None
    return True, first_hit

# Number of consecutive words per phrase used in the overlap check
n = 5  # Change this to the desired number of words


def find_first_matching_entry(annotated_text, retrieved_entries, n):
    """Return the first retrieved entry that shares an n-word phrase with `annotated_text`.

    Entries are tested in list order by passing their 'text_vector' to
    `check_sentence_in_text_vector`. Returns `(entry, phrase)` for the first
    hit and `(None, None)` when no entry matches.
    """
    for candidate in retrieved_entries:
        is_match, phrase = check_sentence_in_text_vector(annotated_text, candidate['text_vector'], n)
        if is_match:
            return candidate, phrase
    return None, None


# Every result record carries the same keys, so the result table has the same
# columns whether or not anything matched (the merge key and the columns that
# overlap with the retrieved table always exist).
result_columns = [
    "sentence_annotation",
    "id_annotation",
    "page_annotation",
    "comment",
    "found_in_text_vector",
    "text_vector",
    "matching_phrase",
    "id_text_vector",
    "barcode",
    "page_text",
    "onb_link",
    "file_annotation",
]

# List to store results: one record per annotated mention
results = []

# For each annotated mention, record the first retrieved entry whose text_vector
# contains one of its n-word phrases; the retrieval-side fields stay None when
# there is no such entry.
for sentence in mentions:
    entry, match = find_first_matching_entry(sentence['annotated_text'], retrieved, n)
    found = entry is not None
    results.append({
        "sentence_annotation": sentence['annotated_text'],
        "id_annotation": sentence['id'],
        "page_annotation": sentence['page'],
        "comment": sentence['comment'],
        "found_in_text_vector": found,
        "text_vector": entry['text_vector'] if found else None,
        "matching_phrase": match,
        "id_text_vector": entry['id'] if found else None,
        "barcode": entry['barcode'] if found else None,
        "page_text": entry['page'] if found else None,
        "onb_link": entry['onb_link'] if found else None,
        "file_annotation": sentence['file'],
    })

retrieved_df = pd.DataFrame(retrieved).rename(columns={'id': 'id_text_vector'})
mat_df = pd.DataFrame(results, columns=result_columns)

# Outer merge keeps retrieved entries without a matching annotation (their
# 'found_in_text_vector' is NaN) as well as annotations without a retrieved entry.
# Columns present in both tables receive the suffixes _x (retrieved) and _y (results).
matches_df = retrieved_df.merge(mat_df, on='id_text_vector', how='outer')

# Sort the DataFrame based on the 'rerank' column in ascending order
matches_df = matches_df.sort_values(by='rerank', ascending=True)

In [9]:
# Rows whose annotation was matched by a retrieved text vector --> True Positives
is_found = matches_df['found_in_text_vector'].eq(True)
found = matches_df[is_found]
found_fil = found['id_annotation'].drop_duplicates()
# NOTE(review): 'page' is the page of the retrieved entry; in the displayed rows it differs
# from 'page_annotation', so the page counts below use the retrieval-side numbering -- confirm.
found_pages = found['page'].drop_duplicates()

# Unique retrieval-side page numbers of the matched rows, as a plain list
unique_foundPages_list = found_pages.tolist()

# Filter rows where the absolute difference between 'page_annotation' and 'page_text' is less than or equal to 4
#found = found[abs(found['page_annotation'] - found['page_text']) <= 6]

n_found_mentions = len(found_fil)
n_found_pages = len(unique_foundPages_list)
n_annotations = len(unique_annot)
n_annotated_pages = len(unique_pages_list)

print(f"True Positives: Found {n_found_mentions} relevant mentions on {n_found_pages} pages of {n_annotations} relevant mentions on {n_annotated_pages} pages")
print(f"({n_found_pages/n_annotated_pages*100:.2f}%).")
print(f"Found average of {n_found_mentions/len(found_pages):.2f} found mentions per page vs. GT of {n_annotations/len(unique_pages_annot):.2f} annotations per page.")

found_columns = ['sentence_annotation', 'text_vector_x', 'rerank', 'id_annotation', 'page', 'page_annotation', 'file_annotation']
found[found_columns]

True Positives: Found 27 relevant mentions on 27 pages of 48 relevant mentions on 40 pages
(67.50%).
Found average of 1.00 found mentions per page vs. GT of 1.20 annotations per page.


Unnamed: 0,sentence_annotation,text_vector_x,rerank,id_annotation,page,page_annotation,file_annotation
0,('h. Ein Trupp Beduinen zu Pferde. brach auf e...,"befand mich in einiger Entfernung davon, und d...",1.0,ffbabe61-0a21-4e4f-b6e4-2ddc4724b40c,430.0,434.0,Z166069305_merged.txt
1,('n. Man muntert beide auch mit einem Klatsche...,Man muntert beide auch mit einem Klatschen der...,2.0,9a41db07-a5c8-4b3f-931a-43418f800768,399.0,403.0,Z166069305_merged.txt
2,('t. Ein kriegerisches und eroberungssuchtiges...,"Eine weife Nation hingegen, die den Ackerbau f...",3.0,2caacee3-c22d-4a76-a2d7-ba794856a103,175.0,176.0,Z166069305_merged.txt
3,('n. Die Waffen eines Mamefu den zu Pferde fin...,Die Waffen eines Mamefu den zu Pferde find zwei,4.0,0d5708db-1bf6-4de4-ad64-9759ebb4fa3c,9.0,10.0,Z166069305_merged.txt
4,('r. Das heise Clima dieser Gegenden des Alter...,"Das Pferd, das eben fo feurig ist als die Luft...",5.0,4f7c0172-b25f-4e62-9cb5-a1c1213c6d7d,220.0,222.0,Z166069305_merged.txt
5,"(""n. Diese Banditen waren nicht damit zufriede...",Andere Uraber erwiesen meinen Gefahrten diesel...,6.0,c86544f5-33f0-4d9a-b048-4995ae36c493,435.0,439.0,Z166069305_merged.txt
6,('s. ?. Wir fahen zwei Beduinen zu Pferde; fie...,?. Wir fahen zwei Beduinen zu Pferde; fie floh...,7.0,3a1cf968-2b82-48d9-a784-d3a7f5198890,413.0,417.0,Z166069305_merged.txt
7,"(""b. Der Hraber muste mir fur den taglichen Pr...",Der Hraber muste mir fur den taglichen Preis v...,8.0,b0450149-d074-4079-87c4-cdc252ce7c67,383.0,387.0,Z166069305_merged.txt
8,"('r. Ich war mit meinem Pferde uorausgeritten,...","Ich war mit meinem Pferde vorausgeritten, und ...",9.0,a16c02e9-16a6-4780-b65e-51510b8db6fb,406.0,410.0,Z166069305_merged.txt
9,('-- Das Lager stand etwan einige hundert Fus ...,Man bauete verschiedente Arten Viehfutter dara...,10.0,b8a8f0ea-926e-4d89-b21d-e89e69a35c2f,394.0,397.0,Z166069305_merged.txt


In [10]:
# Filter rows where 'found_in_text_vector' is False --> False Negatives
missed = matches_df[matches_df['found_in_text_vector'] == False]
missed_fil = missed['id_annotation'].drop_duplicates(keep='first')
missed_pages = missed['page_annotation'].drop_duplicates(keep='first')

# Annotation pages on which at least one mention was matched. The missed pages are annotation
# page numbers, so they are compared with the annotation pages of the matched rows; the
# retrieval-side 'page' values differ from 'page_annotation' in the displayed matches, so
# comparing against them never overlaps and counts pages as both found and missed.
is_matched = matches_df['found_in_text_vector'] == True
found_annotation_pages = matches_df.loc[is_matched, 'page_annotation'].drop_duplicates()

# Split the missed pages into pages without any matched mention and pages on which
# another mention was matched.
on_found_page = missed_pages.isin(found_annotation_pages)
missed_pages_unique = missed_pages[~on_found_page]
retrieved_pages_otherVec = missed_pages[on_found_page]

print(f"False Negatives: Missed {len(missed_fil)} mentions on {len(missed_pages_unique)} pages, of which {len(retrieved_pages_otherVec)} pages were retrieved with another vector.")
print(f"({(len(missed_pages_unique))/len(unique_pages_list)*100:.2f}%).")

missed[['sentence_annotation', 'id_annotation', 'page_annotation', 'file_annotation']]

False Negatives: Missed 21 mentions on 20 pages, of which 0 pages were retrieved with another vector.
(50.00%).


Unnamed: 0,sentence_annotation,id_annotation,page_annotation,file_annotation
200,('r. Wir giengen in geschloffenen Reihen und m...,a406cecb-48dc-4b27-9e2e-e831301a59bc,149.0,Z166069305_merged.txt
201,"('s. Da die Wagen nicht gewohnlich sind, so be...",53e38871-8353-4ea1-acc5-e60964d4b02d,151.0,Z166069305_merged.txt
202,('t. Man tritt mit den Fusen allerhand Muschel...,32cc068f-6a60-4040-aa4a-0575f48b5c52,155.0,Z166069305_merged.txt
203,"('r. Sie kennen kein ander Spaziergehen, --- 0...",638cd5aa-72b7-46a0-bbf1-82ec0f884747,186.0,Z166069305_merged.txt
204,"('s. In dem Departement, wo ich wohne und in d...",8438c431-385f-40f0-b167-51527c2c8e7e,221.0,Z166069305_merged.txt
205,"('t. Ich wuste, das er besucht zu werden uerdi...",41bd83f1-06ec-4186-af2e-63dea256dfc8,267.0,Z166069305_merged.txt
206,('n. Sie hatten in ihrem Hause ein Paar rothe ...,e5ebef45-e36c-4742-9976-702dfad43fc5,270.0,Z166069305_merged.txt
207,('e. Nach diesem Kaufe nahmen wir uon dem gute...,a02a872b-8602-4ff4-93bd-f539ff93ada6,276.0,Z166069305_merged.txt
208,"('n. Der Viceconsul, rein Drogman und ein fran...",a5b12bfb-3a78-46ac-8d62-73d5bf4425c9,277.0,Z166069305_merged.txt
209,"('n. Unsere Esel Fielen nieder, sanken in den ...",ceed7937-1824-4222-ba6c-686b2b4d7738,278.0,Z166069305_merged.txt


In [11]:
# Retrieved rows without any matched annotation ('found_in_text_vector' is NaN) --> False Positives
has_no_annotation = matches_df['found_in_text_vector'].isna()
FP = matches_df[has_no_annotation]
fP_pages = FP['page'].drop_duplicates()

# Unique retrieval-side page numbers of these rows, as a plain list
unique_FPPages_list = fP_pages.tolist()

# Split the pages by whether their number also occurs among the annotated page numbers.
# NOTE(review): 'page' (retrieval side) and the annotated page numbers differ for the matched
# rows shown earlier, so this membership test may compare two numbering schemes -- confirm.
on_annotated_page = fP_pages.isin(unique_pages_list)
ret_pages_Annot = fP_pages[on_annotated_page]
ret_pages_noAnnot = fP_pages[~on_annotated_page]
# Pages that also occur among the pages of the matched rows
on_matched_page = fP_pages.isin(unique_foundPages_list)
retrieved_otherVec = fP_pages[on_matched_page]

n_fp_rows = len(FP)
n_fp_pages = len(unique_FPPages_list)
n_pages_no_annot = len(ret_pages_noAnnot)

print(f"Retrieved {n_fp_rows} vectors on {n_fp_pages} pages of which {len(ret_pages_Annot)} pages do contain annotations (correct pages, but other vector retrieved).")
print(f"and of which {n_pages_no_annot} pages do not contain annotations (= False Positives)")
# NOTE(review): the denominator is the number of annotated pages, so this value can exceed 100
# (it prints 405.00 in the saved output) -- confirm the intended denominator.
print(f"({n_pages_no_annot/len(unique_pages_list)*100:.2f})")
# NOTE(review): these pages were retrieved, so the "True Negatives" label looks questionable -- confirm.
print(f"and of which {len(retrieved_otherVec)} pages were retrieved with other vectors (= True Negatives).")

fp_columns = ['text_vector_x', 'rerank', 'id_text_vector', 'page', 'page_annotation', 'file_annotation']
FP[fp_columns]

Retrieved 173 vectors on 173 pages of which 11 pages do contain annotations (correct pages, but other vector retrieved).
and of which 162 pages do not contain annotations (= False Positives)
(405.00)
and of which 0 pages were retrieved with other vectors (= True Negatives).


Unnamed: 0,text_vector_x,rerank,id_text_vector,page,page_annotation,file_annotation
16,"Unter einer Regierung, die nur die Absicht bat...",17.0,9ee8949a-e0cc-4334-bc36-b71becd2d5c7,400.0,,
23,feiner Neise 1. th.).,24.0,39e2e422-b7c3-40c1-90ee-12718c603261,393.0,,
27,**) Ursprung der Gefese 2. Bd.,28.0,4ff5ac89-e2f3-468f-bad6-712ecce32e54,170.0,,
28,"Von frih morgens an, bis auf den Abend, hat ma...",29.0,d4357d62-9022-4b84-803b-e634b791a923,182.0,,
29,"Das Wasser, das * Strix passerina, Lin, Chevec...",30.0,7aaae2e6-1e90-46a2-9f05-fe28c7b2842b,236.0,,
...,...,...,...,...,...,...
195,"fchichte uberhaupt, und jene des. Mungo insbes...",196.0,f8995776-d337-413e-bc6b-d54a1fb5ff86,223.0,,
196,"Ob sie gleich gesehen hatten, was seit Morgens...",197.0,3315a574-24e7-4a73-b660-bd151a27956b,437.0,,
197,Die fehr oftern Uncinigkeiten zwischen den Beh...,198.0,bd45fa07-bd6a-46f3-9ba9-516d48d55fb5,161.0,,
198,"Es ist bemerkenswerth, das man diesen unruhige...",199.0,736f2f9e-85b2-470d-a94e-f34f64db4189,89.0,,
