""" Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17-19 century German texts in the ONiT project. Code by Michela Vignoli partially generated with Chat GPT3, GPT4 (free version), and Claude (free version). """ # Import packages import gradio as gr import pandas as pd from difflib import SequenceMatcher import re # Import results results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100) results_prep = pd.read_csv("data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv").head(100) results_orig = pd.read_csv("data/retrieval_results/sonnini_original_OCR/i_onit-test-index-sonnini-q_Pferd-Pferde.csv").head(100) annotations = pd.read_csv("data/annotations/DHd2025_referenceReports_annotations_preview_horses.csv") # Drop 'text_prep' from results_orig results_clean.drop(columns=['text_prep'], inplace=True) # Modify the "document" column to remove "_page175.txt" and keep the "Z166069305_00175" results_orig['document'] = results_orig['document'].str[:-12] # Modify the "page" column to extract the numeric part and remove leading zeroes results_orig['page'] = results_orig['page'].str.extract(r'(\d+)', expand=False).astype(int) data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed OCR": results_prep, "Results Original OCR": results_orig, "Annotations": annotations} # Pagination settings R = 5 # Number of preview rows per page def normalize_text(text): """Normalize text for better matching by removing extra whitespace and standardizing characters.""" # Remove extra whitespace text = ' '.join(text.split()) # Could add more normalization steps here if needed return text def find_best_match(needle, haystack): """Find the best matching position of needle in haystack using fuzzy matching.""" matcher = SequenceMatcher(None, needle, haystack) matches = matcher.get_matching_blocks() # Find the best match that exceeds our threshold best_match = None best_match_ratio = 0.5 # Initialize the best match ratio with our minimum threshold for match in matches: i, j, n = match if n > 0: # Only consider non-zero length matches subsequence = haystack[j:j+n] ratio = SequenceMatcher(None, needle, subsequence).ratio() if ratio > best_match_ratio: best_match = (j, j+n) best_match_ratio = ratio return best_match def highlight_text(text, highlights): """ Highlight specified text segments using fuzzy matching and HTML mark tags. Args: text (str): The original text to highlight highlights (str or list): Text segment(s) to highlight Returns: str: Text with highlights wrapped in tags """ if not text or not highlights: return text # Ensure highlights is a list if isinstance(highlights, str): highlights = [highlights] # Remove empty or None highlights highlights = [h for h in highlights if h] if not highlights: return text # Sort highlights by length (longest first) to avoid nested highlights highlights = sorted(highlights, key=len, reverse=True) # Store positions to highlight positions_to_highlight = [] # Find positions for each highlight for highlight in highlights: normalized_highlight = normalize_text(highlight) normalized_text = normalize_text(text) match = find_best_match(normalized_highlight, normalized_text) if match: start, end = match # Convert positions back to original text original_start = len(text[:start].rstrip()) original_end = original_start + len(text[start:end].strip()) positions_to_highlight.append((original_start, original_end)) # Sort positions by start position positions_to_highlight.sort() # Apply highlights from end to start to avoid position shifting for start, end in reversed(positions_to_highlight): text = f"{text[:start]}{text[start:end]}{text[end:]}" return text # Function to create preview rows def preview_results(page, selected_data_source): data_source = data_sources[selected_data_source] start_idx = (page - 1) * R end_idx = min(start_idx + R, len(data_source)) results = data_source.iloc[start_idx:end_idx] row_elements = [] for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1): highlighted_text = row['unpacked_highlights'] # Highlight "Pferd" and "Pferde" using a span with a yellow background highlighted_text = re.sub(r'\b(Pferd\w*)\b', r"\1", highlighted_text, flags=re.IGNORECASE) row_html = f"""

{idx}. \'{row['document']}\' - Score: {row['_score']} - Rank: {row['rank']}
{highlighted_text}

""" row_elements.append(row_html) return "".join(row_elements) # Function to show details of a selected row def show_details(document_name, selected_data_source): data_source = data_sources[selected_data_source] row = data_source[data_source["document"] == document_name] if row.empty: return "

Document not found. Please select a valid document.

" row = row.iloc[0] # Extract first matching row return f"""

📄 Preview: {document_name}

Retrieved text chunk: {row["unpacked_highlights"]}

Text on page {row['page']}: {highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}

🔍 Open ÖNB Viewer

IIIF Image Preview

Source: C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens
und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften,
Reisen in Ober= und Niederägypten, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800

Citation link: http://data.onb.ac.at/rep/1058B194

""" # Gradio Interface with gr.Blocks() as demo: gr.Markdown(""" ## 🔍 Preview Text Retrieval Results with Marqo Vector Database

Instructions: Browse through the retrieval results for the text prompt "Pferd, Pferde" by sliding the page slider (up to 100 first retrieval results can be inspected). Select the data source: Choose between Results Cleaned OCR, Results LLM Preprocessed OCR, Results Original OCR, and our Annotations of text passages mentioning horses and kindred animals in the text. To visualise details about the retrieved text chunk, copy and paste the document name (e.g. Z166069305_430) in the search bar below and click on the Inspect button. Please note that pressing Enter does not work. To inspect the page in the full book, click on Open ONB Viewer in the document details below.

""") data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR") page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True) preview_output = gr.HTML() gr.Markdown("## 📝 Inspect Document Details") doc_name_input = gr.Textbox(label="Copy and paste document name to search bar (e.g. Z166069305_430):", interactive=True) inspect_button = gr.Button("Inspect") inspect_output = gr.HTML() # Function to update preview when data source changes def update_data_source(selected_data_source): max_page = (len(data_sources[selected_data_source]) // R) + 1 page_slider.maximum = max_page # Update the max page count dynamically return preview_results(1, selected_data_source), 1 # Reset slider to 1 # Function to update preview when page slider changes def update_preview(page, selected_data_source): return preview_results(page, selected_data_source) # Function to update document details def update_details(doc_name, selected_data_source): return show_details(doc_name, selected_data_source) # Handle data source change data_source_dropdown.change( update_data_source, inputs=[data_source_dropdown], outputs=[preview_output, page_slider] # Update both preview and reset slider ) # Handle page slider change page_slider.change(update_preview, inputs=[page_slider, data_source_dropdown], outputs=[preview_output]) # Handle inspect button click inspect_button.click(update_details, inputs=[doc_name_input, data_source_dropdown], outputs=[inspect_output]) # Initialize preview with default data source preview_output.value = update_data_source("Results Cleaned OCR") # Further information block at the end gr.Markdown(""" ## 📚 Further Information

This demo lets you explore our preliminary results for retrieving nature representations in imperfect OCR data extracted from 17-19 century German texts. This research was done in the Ottoman Nature in Travelogues (ONiT) project and funded by the Austrian Science Fund (FWF: P 35245). The text retrieval was done with hybrid vector/lexical search (BM25) by using a Marqo vector index. The texts were indexed as one page per document unit, and by splitting them in 2-sentence vectors and embedding them with flax-sentence-embeddings/all_datasets_v4_mpnet-base model. Results Cleaned OCR contain the retrieval results for the vectorized OCR texts that were cleaned by using regular expressions. Results LLM Preprocessed OCR contain the retrieval results for the vectorized OCR texts that were automatically corrected with Llama3.1:70b. Results Original OCR contain the retrieval results for the original OCR texts (without any preprocessing).

For more information, contact michela(dot)vignoli(at)ait(dot)ac(dot)at.

""") demo.launch()