Spaces:
Sleeping
Sleeping
"""
Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17th–19th-century German texts in the ONiT project.
Code by Michela Vignoli, partially generated with ChatGPT (GPT-3 and GPT-4, free version) and Claude (free version).
"""
# Import packages | |
import gradio as gr | |
import pandas as pd | |
from difflib import SequenceMatcher | |
import re | |
# Import results: keep only the first 100 hits of each retrieval run
results_clean = pd.read_csv(
    "data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv"
).head(100)
results_prep = pd.read_csv(
    "data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv"
).head(100)
results_orig = pd.read_csv(
    "data/retrieval_results/sonnini_original_OCR/i_onit-test-index-sonnini-q_Pferd-Pferde.csv"
).head(100)
annotations = pd.read_csv("data/annotations/DHd2025_referenceReports_annotations_preview_horses.csv")

# Drop 'text_prep' from results_clean (not results_orig): the detail view
# prefers 'text_prep' over 'text_clean' whenever that column is present.
results_clean = results_clean.drop(columns=['text_prep'])

# Cut the last 12 characters off the "document" column
# (a suffix of the form "_page175.txt").
results_orig['document'] = results_orig['document'].str.slice(stop=-12)

# Keep only the first run of digits of the "page" column as an integer,
# which also removes leading zeroes.
page_digits = results_orig['page'].str.extract(r'(\d+)', expand=False)
results_orig['page'] = page_digits.astype(int)

# Mapping from the label shown in the data source dropdown to its table
data_sources = {
    "Results Cleaned OCR": results_clean,
    "Results LLM Preprocessed OCR": results_prep,
    "Results Original OCR": results_orig,
    "Annotations": annotations,
}

# Pagination settings
R = 5  # Number of preview rows per page
def normalize_text(text):
    """Collapse every run of whitespace in text to one space and trim both ends."""
    # str.split() without arguments splits on any whitespace run and ignores
    # leading/trailing whitespace, so re-joining the pieces is the whole job.
    # Further normalization steps could be added here if needed.
    words = text.split()
    normalized = ' '.join(words)
    return normalized
def find_best_match(needle, haystack):
    """
    Find the best matching position of needle in haystack.

    The candidates are the contiguous blocks that difflib reports as common to
    both strings. Each block is scored by its similarity ratio to the whole
    needle, and the best block is returned if its ratio exceeds 0.5.

    Args:
        needle (str): The text to look for
        haystack (str): The text to search in
    Returns:
        tuple or None: (start, end) offsets into haystack (end exclusive) of
        the best block, or None when no block scores above the threshold
    """
    needle_length = len(needle)
    # autojunk must be off: by default SequenceMatcher ignores characters that
    # are "popular" in the second sequence once it has 200+ items, which makes
    # it miss most (or all) matches in page-length text.
    matcher = SequenceMatcher(None, needle, haystack, autojunk=False)
    best_match = None
    best_match_ratio = 0.5  # Minimum threshold; a block has to beat it
    for _, j, n in matcher.get_matching_blocks():
        if n == 0:  # Skip the zero-length sentinel block
            continue
        # The block is an exact substring of needle, so its ratio against the
        # needle is 2 * matches / total length with matches == n. No second
        # SequenceMatcher is needed to compute it.
        ratio = 2.0 * n / (needle_length + n)
        if ratio > best_match_ratio:
            best_match = (j, j + n)
            best_match_ratio = ratio
    return best_match
def _normalize_with_offsets(text):
    """
    Whitespace-normalize text and remember where each character came from.

    Produces the same string as ' '.join(text.split()), plus a list giving,
    for every character of that string, its index in the original text.
    """
    chars = []
    offsets = []
    pending_space = None  # Index of the first char of an unflushed whitespace run
    for position, char in enumerate(text):
        if char.isspace():
            # Leading whitespace is dropped; inner runs collapse to one space
            if chars and pending_space is None:
                pending_space = position
            continue
        if pending_space is not None:
            chars.append(' ')
            offsets.append(pending_space)
            pending_space = None
        chars.append(char)
        offsets.append(position)
    # A whitespace run still pending here is trailing and is dropped
    return ''.join(chars), offsets


def highlight_text(text, highlights):
    """
    Highlight specified text segments using fuzzy matching and HTML mark tags.

    Matching is done on whitespace-normalized copies of the text and of each
    highlight. The matched offsets are then mapped back to the original text,
    so the marks stay aligned even when the text contains repeated spaces or
    line breaks.

    Args:
        text (str): The original text to highlight
        highlights (str or list): Text segment(s) to highlight
    Returns:
        str: Text with highlights wrapped in <mark> tags; the input is
        returned unchanged when there is nothing to highlight or when text
        is not a string
    """
    if not text or not highlights:
        return text
    # Non-string cells (e.g. NaN from a CSV) cannot be searched
    if not isinstance(text, str):
        return text
    # Ensure highlights is a list
    if isinstance(highlights, str):
        highlights = [highlights]
    # Remove empty, None or non-string highlights
    highlights = [h for h in highlights if isinstance(h, str) and h]
    if not highlights:
        return text
    # Longest first, as before
    highlights = sorted(highlights, key=len, reverse=True)

    # Normalize the text once; it is the same for every highlight
    normalized_text, offsets = _normalize_with_offsets(text)

    spans = []
    for highlight in highlights:
        match = find_best_match(normalize_text(highlight), normalized_text)
        if not match:
            continue
        start, end = match
        # Do not let a mark begin or end on a separator space
        while start < end and normalized_text[start] == ' ':
            start += 1
        while end > start and normalized_text[end - 1] == ' ':
            end -= 1
        if start == end:
            continue
        # Translate normalized offsets back into offsets of the original text
        spans.append((offsets[start], offsets[end - 1] + 1))

    # Merge overlapping or touching spans so <mark> tags never nest or cross
    spans.sort()
    merged = []
    for span_start, span_end in spans:
        if merged and span_start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], span_end)
        else:
            merged.append([span_start, span_end])

    # Apply highlights from end to start to avoid position shifting
    for span_start, span_end in reversed(merged):
        text = f"{text[:span_start]}<mark>{text[span_start:span_end]}</mark>{text[span_end:]}"
    return text
# Function to create preview rows
def preview_results(page, selected_data_source):
    """
    Render one page of preview rows for the selected data source as HTML.

    Args:
        page (int): 1-based page number
        selected_data_source (str): Key into data_sources
    Returns:
        str: Concatenated HTML of up to R result cards; an empty string when
        the page lies beyond the last row
    """
    frame = data_sources[selected_data_source]
    first = (page - 1) * R
    last = min(first + R, len(frame))
    page_rows = frame.iloc[first:last]

    # Every word starting with "Pferd" (any case) gets a bold yellow span
    keyword = re.compile(r'\b(Pferd\w*)\b', flags=re.IGNORECASE)
    marked = r"<span style='background-color: yellow; font-weight: bold;'>\1</span>"

    def render(idx, row):
        highlighted_text = keyword.sub(marked, row['unpacked_highlights'])
        return f"""
<div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
<b>{idx}. \'{row['document']}\'</b> - Score: {row['_score']} - Rank: {row['rank']}
<br><i>{highlighted_text}</i>
</div>
"""

    # Cards are numbered continuously across pages, starting at 1
    numbered_rows = enumerate(page_rows.iterrows(), start=first + 1)
    return "".join(render(idx, row) for idx, (_, row) in numbered_rows)
# Function to show details of a selected row
def show_details(document_name, selected_data_source):
    """
    Build the HTML detail view for one document of the selected data source.

    Args:
        document_name (str): Value of the "document" column to look up;
            surrounding whitespace from copy and paste is ignored
        selected_data_source (str): Key into data_sources
    Returns:
        str: HTML with the retrieved chunk, the page text with the chunk
        highlighted, a viewer link and the page image; or a red error
        paragraph when no row has that document name
    """
    data_source = data_sources[selected_data_source]
    # Pasted names often carry a stray space or newline that would never match
    if isinstance(document_name, str):
        document_name = document_name.strip()
    row = data_source[data_source["document"] == document_name]
    if row.empty:
        return "<p style='color:red;'>Document not found. Please select a valid document.</p>"
    row = row.iloc[0]  # Extract first matching row

    # Take the first text column that actually holds a non-empty string.
    # A plain `or` chain would stop at NaN, which is truthy.
    page_text = ""
    for column in ("text_prep", "text_clean", "text"):
        candidate = row.get(column)
        if isinstance(candidate, str) and candidate:
            page_text = candidate
            break
    highlighted_page = highlight_text(page_text, row["unpacked_highlights"])

    return f"""
<div style="display: flex; justify-content: space-between; align-items: start;">
<div style="width: 65%; font-size: 18px;">
<h3>📄 Preview: {document_name}</h3>
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
<p><b>Text on page {row['page']}: </b>{highlighted_page}</p>
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">🔍 Open ÖNB Viewer</a></p>
</div>
<div style="width: 30%; text-align: right;">
<img src="{row['iiif_link']}" alt="IIIF Image Preview"
style="max-width: 100%; height: auto; border: 1px solid #ddd;">
</div>
</div>
<div style="font-size: 18px;">
<p><b>Source: </b>C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens <br>und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften, <br><i>Reisen in Ober= und Niederägypten</i>, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800</p>
<p><b>Citation link: </b><a href="http://data.onb.ac.at/rep/1058B194" target="_blank">http://data.onb.ac.at/rep/1058B194</a></p>
</div>
"""
# Gradio Interface
default_data_source = "Results Cleaned OCR"


def _page_count(selected_data_source):
    """Return the number of preview pages a data source needs (at least 1)."""
    row_count = len(data_sources[selected_data_source])
    # Ceiling division: a row count that is a multiple of R must not
    # produce an extra, empty last page.
    return max(1, -(-row_count // R))


with gr.Blocks() as demo:
    gr.Markdown("""
## 🔍 Preview Text Retrieval Results with Marqo Vector Database
<div style="font-size: 18px;">
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
Select the data source: Choose between <i>Results Cleaned OCR, Results LLM Preprocessed OCR, Results Original OCR,</i> and our <i>Annotations</i> of text passages mentioning <i>horses and kindred animals</i> in the text.
To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button.
Please note that pressing <i>Enter</i> does not work.
To inspect the page in the full book, click on <i>Open ONB Viewer</i> in the document details below.</p>
</div>""")
    data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value=default_data_source)
    # The slider and the preview are created with their initial state, so
    # nothing has to be patched onto the components afterwards.
    page_slider = gr.Slider(1, _page_count(default_data_source), value=1, step=1, label="Page", interactive=True)
    preview_output = gr.HTML(value=preview_results(1, default_data_source))
    gr.Markdown("## 📝 Inspect Document Details")
    doc_name_input = gr.Textbox(label="Copy and paste document name to search bar (e.g. Z166069305_430):", interactive=True)
    inspect_button = gr.Button("Inspect")
    inspect_output = gr.HTML()

    # Function to update preview when data source changes
    def update_data_source(selected_data_source):
        """Return the first preview page and a slider update for the new source."""
        # Setting page_slider.maximum on the Python object does not reach the
        # browser; the new range has to be returned as an update for the
        # slider output. The slider is reset to page 1 at the same time.
        slider_update = gr.update(maximum=_page_count(selected_data_source), value=1)
        return preview_results(1, selected_data_source), slider_update

    # Function to update preview when page slider changes
    def update_preview(page, selected_data_source):
        """Return the preview HTML for the chosen page of the chosen source."""
        return preview_results(page, selected_data_source)

    # Function to update document details
    def update_details(doc_name, selected_data_source):
        """Return the detail HTML for the document name typed by the user."""
        return show_details(doc_name, selected_data_source)

    # Handle data source change
    data_source_dropdown.change(
        update_data_source,
        inputs=[data_source_dropdown],
        outputs=[preview_output, page_slider]  # Update both preview and reset slider
    )
    # Handle page slider change
    page_slider.change(update_preview, inputs=[page_slider, data_source_dropdown], outputs=[preview_output])
    # Handle inspect button click
    inspect_button.click(update_details, inputs=[doc_name_input, data_source_dropdown], outputs=[inspect_output])
    # Further information block at the end
    gr.Markdown("""
## 📚 Further Information
<div style="font-size: 18px;">
<p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a>
vector index. The texts were indexed as one page per document unit, and by splitting them in 2-sentence vectors and embedding them with
<a href="https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base">flax-sentence-embeddings/all_datasets_v4_mpnet-base</a> model.
<i>Results Cleaned OCR</i> contain the retrieval results for the vectorized OCR texts that were cleaned by using regular expressions.
<i>Results LLM Preprocessed OCR</i> contain the retrieval results for the vectorized OCR texts that were automatically corrected with Llama3.1:70b.
<i>Results Original OCR</i> contain the retrieval results for the original OCR texts (without any preprocessing).</p>
<p>For more information, contact <a href="mailto:michela.vignoli@ait.ac.at">michela(dot)vignoli(at)ait(dot)ac(dot)at</a>.</p>
</div>
""")
demo.launch()