File size: 11,414 Bytes
559c653
 
 
 
 
e62e0c5
 
 
 
c59454b
52d525a
e62e0c5
05202ae
 
099f9b1
ddbc294
e62e0c5
 
 
 
 
 
 
 
 
 
ddbc294
e62e0c5
 
 
 
559c653
 
 
 
 
 
 
 
 
 
 
 
 
 
03eaea3
559c653
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
559c653
 
 
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
 
559c653
 
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
559c653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
c59454b
 
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559c653
e62e0c5
559c653
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59454b
 
a9cd115
ff8bdc8
a3eb292
ff8bdc8
 
559c653
c59454b
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9cd115
e62e0c5
 
 
 
ff8bdc8
 
 
 
e62e0c5
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17-19 century German texts in the ONiT project.
Code by Michela Vignoli partially generated with Chat GPT3, GPT4 (free version), and Claude (free version).
"""

# Import packages
import gradio as gr
import pandas as pd
from difflib import SequenceMatcher
import re

# Import results
results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
results_prep = pd.read_csv("data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv").head(100)
results_orig = pd.read_csv("data/retrieval_results/sonnini_original_OCR/i_onit-test-index-sonnini-q_Pferd-Pferde.csv").head(100)
annotations = pd.read_csv("data/annotations/DHd2025_referenceReports_annotations_preview_horses.csv")

# Drop 'text_prep' from results_orig
results_clean.drop(columns=['text_prep'], inplace=True)

# Modify the "document" column to remove "_page175.txt" and keep the "Z166069305_00175"
results_orig['document'] = results_orig['document'].str[:-12]

# Modify the "page" column to extract the numeric part and remove leading zeroes
results_orig['page'] = results_orig['page'].str.extract(r'(\d+)', expand=False).astype(int)

data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed OCR": results_prep, "Results Original OCR": results_orig, "Annotations": annotations}

# Pagination settings
R = 5  # Number of preview rows per page

def normalize_text(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed as well, which makes fuzzy
    comparisons between OCR variants of the same passage more reliable.
    """
    # str.split() with no argument splits on arbitrary whitespace runs and
    # drops empty fields, so join+split performs the full normalization.
    return " ".join(text.split())

def find_best_match(needle, haystack):
    """Locate the span of *haystack* that best resembles *needle*.

    Candidate spans are the matching blocks reported by SequenceMatcher;
    each candidate is re-scored against the full needle, and the highest
    scorer above a 0.5 similarity threshold wins.

    Returns:
        tuple | None: (start, end) indices into *haystack*, or None when
        no candidate clears the threshold.
    """
    best_span = None
    best_score = 0.5  # minimum acceptable similarity ratio

    for _, pos, size in SequenceMatcher(None, needle, haystack).get_matching_blocks():
        if size == 0:
            continue  # skip the terminating dummy block
        candidate = haystack[pos:pos + size]
        score = SequenceMatcher(None, needle, candidate).ratio()
        if score > best_score:
            best_score = score
            best_span = (pos, pos + size)

    return best_span

def _normalize_with_index_map(text):
    """Collapse whitespace exactly like normalize_text(), additionally
    returning a map from each normalized-character index back to its index
    in the original string (a collapsed whitespace run maps to the index of
    its first whitespace character)."""
    chars, index_map = [], []
    run_start = None  # original index of the pending whitespace run, if any
    for i, ch in enumerate(text):
        if ch.isspace():
            # Leading whitespace is dropped entirely; interior runs are
            # remembered so they can be emitted as a single space.
            if chars and run_start is None:
                run_start = i
        else:
            if run_start is not None:
                chars.append(" ")
                index_map.append(run_start)
                run_start = None
            chars.append(ch)
            index_map.append(i)
    return "".join(chars), index_map


def highlight_text(text, highlights):
    """
    Highlight specified text segments using fuzzy matching and HTML mark tags.

    Args:
        text (str): The original text to highlight
        highlights (str or list): Text segment(s) to highlight

    Returns:
        str: Text with highlights wrapped in <mark> tags
    """
    if not text or not highlights:
        return text

    # Ensure highlights is a list
    if isinstance(highlights, str):
        highlights = [highlights]

    # Remove empty or None highlights
    highlights = [h for h in highlights if h]
    if not highlights:
        return text

    # Sort highlights by length (longest first) so the most specific
    # passages claim their span first
    highlights = sorted(highlights, key=len, reverse=True)

    # Normalize once, keeping an index map back into the original text.
    # (The previous rstrip()-based back-mapping was off by the amount of
    # collapsed whitespace and shifted highlights onto wrong characters.)
    normalized_text, index_map = _normalize_with_index_map(text)

    # Collect (start, end) spans in original-text coordinates
    spans = []
    for highlight in highlights:
        match = find_best_match(normalize_text(highlight), normalized_text)
        if match:
            start, end = match
            spans.append((index_map[start], index_map[end - 1] + 1))

    if not spans:
        return text

    # Merge overlapping spans; inserting marks for overlapping ranges would
    # produce interleaved <mark> tags and malformed HTML.
    spans.sort()
    merged = [spans[0]]
    for start, end in spans[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))

    # Apply highlights from end to start to avoid position shifting
    for start, end in reversed(merged):
        text = f"{text[:start]}<mark>{text[start:end]}</mark>{text[end:]}"

    return text

# Function to create preview rows
def preview_results(page, selected_data_source):
    """Render one page (R rows) of retrieval results as an HTML string.

    Rows are taken from the DataFrame registered under
    *selected_data_source*; every token starting with "Pferd" in the
    retrieved chunk is wrapped in a yellow, bold span.
    """
    frame = data_sources[selected_data_source]
    first = (page - 1) * R
    last = min(first + R, len(frame))

    pieces = []
    rank_no = first
    for _, record in frame.iloc[first:last].iterrows():
        rank_no += 1
        # Highlight "Pferd" and all its inflections, case-insensitively
        marked = re.sub(
            r'\b(Pferd\w*)\b',
            r"<span style='background-color: yellow; font-weight: bold;'>\1</span>",
            record['unpacked_highlights'],
            flags=re.IGNORECASE,
        )
        fragment = f"""
        <div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
            <b>{rank_no}. \'{record['document']}\'</b> - Score: {record['_score']} - Rank: {record['rank']}
            <br><i>{marked}</i>
        </div>
        """
        pieces.append(fragment)

    return "".join(pieces)

# Function to show details of a selected row
def show_details(document_name, selected_data_source):
    """Render an HTML detail view for one retrieved document.

    Looks *document_name* up in the selected DataFrame and returns an HTML
    fragment with the retrieved chunk, the highlighted page text, a IIIF
    image preview, and a link to the ÖNB viewer. Returns an error message
    when the document is not found.
    """
    data_source = data_sources[selected_data_source]
    row = data_source[data_source["document"] == document_name]
    
    if row.empty:
        return "<p style='color:red;'>Document not found. Please select a valid document.</p>"

    row = row.iloc[0]  # Extract first matching row
    # Page text column differs per data source: prefer the LLM-corrected
    # text, then the regex-cleaned text, then the raw OCR text.
    # Fixed below: the citation link previously had no space before
    # target="_blank" (malformed attribute) and an unclosed <b> tag.
    return f"""
    <div style="display: flex; justify-content: space-between; align-items: start;">
        <div style="width: 65%; font-size: 18px;">
            <h3>📄 Preview: {document_name}</h3>
            <p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
            <p><b>Text on page {row['page']}: </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
            <p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">🔍 Open ÖNB Viewer</a></p>
        </div>
        <div style="width: 30%; text-align: right;">
            <img src="{row['iiif_link']}" alt="IIIF Image Preview" 
                 style="max-width: 100%; height: auto; border: 1px solid #ddd;">
        </div>
    </div>
    <div style="font-size: 18px;">
        <p><b>Source: </b>C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens <br>und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften, <br><i>Reisen in Ober= und Niederägypten</i>, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800</p>
        <p><b>Citation link: </b><a href="http://data.onb.ac.at/rep/1058B194" target="_blank">http://data.onb.ac.at/rep/1058B194</a></p>
    </div>
    """

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("""
                ## 🔍 Preview Text Retrieval Results with Marqo Vector Database
                <div style="font-size: 18px;">
                <p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected). 
                Select the data source: Choose between <i>Results Cleaned OCR, Results LLM Preprocessed OCR, Results Original OCR,</i> and our <i>Annotations</i> of text passages mentioning <i>horses and kindred animals</i> in the text. 
                To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. 
                Please note that pressing <i>Enter</i> does not work. 
                To inspect the page in the full book, click on <i>Open ONB Viewer</i> in the document details below.</p>
                </div>""")

    data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
    page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True)
    preview_output = gr.HTML()

    gr.Markdown("## 📝 Inspect Document Details")

    doc_name_input = gr.Textbox(label="Copy and paste document name to search bar (e.g. Z166069305_430):", interactive=True)
    inspect_button = gr.Button("Inspect")
    inspect_output = gr.HTML()

    # Function to update preview when data source changes
    def update_data_source(selected_data_source):
        """Rebuild page 1 of the preview for the new source and resize/reset the slider."""
        # Ceiling division: 100 rows at 5/page -> 20 pages. The previous
        # `len // R + 1` produced a trailing empty page whenever len % R == 0.
        max_page = max(1, -(-len(data_sources[selected_data_source]) // R))
        # Assigning page_slider.maximum after creation never reaches the
        # client, so the new range must travel through the event output.
        return preview_results(1, selected_data_source), gr.update(maximum=max_page, value=1)

    # Function to update preview when page slider changes
    def update_preview(page, selected_data_source):
        return preview_results(page, selected_data_source)

    # Function to update document details
    def update_details(doc_name, selected_data_source):
        return show_details(doc_name, selected_data_source)

    # Handle data source change: refresh the preview and reset the slider
    data_source_dropdown.change(
        update_data_source,
        inputs=[data_source_dropdown],
        outputs=[preview_output, page_slider]
    )
    # Handle page slider change
    page_slider.change(update_preview, inputs=[page_slider, data_source_dropdown], outputs=[preview_output])

    # Handle inspect button click
    inspect_button.click(update_details, inputs=[doc_name_input, data_source_dropdown], outputs=[inspect_output])

    # Initialize preview with default data source.
    # update_data_source returns (html, slider_update); the old code assigned
    # the whole tuple to preview_output.value, rendering the tuple repr.
    initial_preview, _ = update_data_source("Results Cleaned OCR")
    preview_output.value = initial_preview

    # Further information block at the end
    gr.Markdown("""
    ## 📚 Further Information
    <div style="font-size: 18px;">
        <p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
        This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
        The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a> 
        vector index. The texts were indexed as one page per document unit, and by splitting them in 2-sentence vectors and embedding them with 
        <a href="https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base">flax-sentence-embeddings/all_datasets_v4_mpnet-base</a> model. 
        <i>Results Cleaned OCR</i> contain the retrieval results for the vectorized OCR texts that were cleaned by using regular expressions. 
        <i>Results LLM Preprocessed OCR</i> contain the retrieval results for the vectorized OCR texts that were automatically corrected with Llama3.1:70b. 
        <i>Results Original OCR</i> contain the retrieval results for the original OCR texts (without any preprocessing).</p>
        <p>For more information, contact <a href="mailto:[email protected]">michela(dot)vignoli(at)ait(dot)ac(dot)at</a>.</p>
    </div>
    """)

demo.launch()