# Import packages
import gradio as gr
import pandas as pd
from difflib import SequenceMatcher

# Import results: the first 100 rows of each retrieval result file
results_clean = pd.read_csv(
    "data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv"
).head(100)
results_prep = pd.read_csv(
    "data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv"
).head(100)
results_orig = pd.read_csv(
    "data/retrieval_results/sonnini_original_ocr/i_onit-test-index-sonnini-q_Pferd-Pferde.csv"
).head(100)

# Drop 'text_prep' from results_clean so that show_details() falls back to
# the 'text_clean' column for this data source.
results_clean.drop(columns=['text_prep'], inplace=True)

# Modify the "document" column to remove the "_page175.txt" style suffix and
# keep the "Z166069305_00175" part. A pattern is used instead of a fixed
# 12-character slice so page numbers with fewer or more digits also work.
results_orig['document'] = results_orig['document'].str.replace(
    r'_page\d+\.txt$', '', regex=True
)

# Modify the "page" column to extract the numeric part and remove leading zeroes
results_orig['page'] = (
    results_orig['page'].str.extract(r'(\d+)', expand=False).astype(int)
)

data_sources = {
    "Results Cleaned OCR": results_clean,
    "Results LLM Preprocessed OCR": results_prep,
    "Results Original OCR": results_orig,
}

# Data source shown when the demo starts
DEFAULT_DATA_SOURCE = "Results Cleaned OCR"

# Pagination settings
R = 5  # Number of preview rows per page


def _page_count(n_rows):
    """Return the number of preview pages needed for *n_rows* rows (at least 1)."""
    # Ceiling division: an exact multiple of R must not yield an empty last page.
    return max(1, (n_rows + R - 1) // R)


def _first_text(row, columns):
    """Return the first non-empty string found in *row* under *columns*, else ""."""
    for column in columns:
        value = row.get(column)
        # Missing CSV cells are NaN (a truthy float), so test the type explicitly.
        if isinstance(value, str) and value:
            return value
    return ""


# Define a function to highlight parts of the text
def highlight_text(text, highlights):
    """Wrap every occurrence of each highlight inside *text* in <mark> tags.

    Args:
        text: The text to search. Non-string values (None, NaN) give "".
        highlights: A single string or a list of strings to highlight.

    Returns:
        The text with each highlight wrapped in <mark>...</mark>.
    """
    if not isinstance(text, str):
        return ""
    # Ensure highlights is a list of strings
    if isinstance(highlights, str):
        highlights = [highlights]
    elif highlights is None or isinstance(highlights, float):
        highlights = []  # missing value (None / NaN): nothing to highlight
    # Wrap each highlight in <mark> tags
    for highlight in highlights:
        # Skip empty or non-string entries: replacing "" would insert the
        # markup between every character of the text.
        if not isinstance(highlight, str) or not highlight:
            continue
        # NOTE(review): the wrapping tag was missing in this copy of the file,
        # which made the replacement a no-op; <mark> is assumed - confirm.
        text = text.replace(highlight, f'<mark>{highlight}</mark>')
    return text


# Function to create preview rows
def preview_results(page, selected_data_source):
    """Render one page of R result rows from the selected data source.

    Args:
        page: 1-based page number coming from the slider.
        selected_data_source: Key into ``data_sources``.

    Returns:
        The concatenated preview text for the rows on that page.
    """
    data_source = data_sources[selected_data_source]
    page = int(page)  # a slider may deliver a float; iloc needs integers
    start_idx = (page - 1) * R
    end_idx = min(start_idx + R, len(data_source))
    results = data_source.iloc[start_idx:end_idx]

    row_elements = []
    for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1):
        highlighted_text = row['unpacked_highlights']
        row_html = f"""

{idx}. '{row['document']}' - Score: {row['_score']} - Rank: {row['rank']}
{highlighted_text}

"""
        row_elements.append(row_html)
    return "".join(row_elements)


# Function to show details of a selected row
def show_details(document_name, selected_data_source):
    """Render the detail view of the first row whose document name matches.

    Args:
        document_name: Document identifier typed or pasted by the user.
        selected_data_source: Key into ``data_sources``.

    Returns:
        The detail text, or a "not found" message when nothing matches.
    """
    data_source = data_sources[selected_data_source]
    # Pasted names often carry stray whitespace; ignore it when matching.
    wanted = (document_name or "").strip()
    matches = data_source[data_source["document"] == wanted]
    if matches.empty:
        return "\n\nDocument not found. Please select a valid document.\n\n"

    row = matches.iloc[0]  # Extract first matching row
    retrieved_chunk = row["unpacked_highlights"]
    # Prefer the LLM-corrected text, then the cleaned text, then the raw OCR.
    ocr_text = highlight_text(
        _first_text(row, ("text_prep", "text_clean", "text")),
        retrieved_chunk,
    )
    return f"""

📄 Preview: {row['barcode']}, Page {row['page']}

Retrieved text chunk: {retrieved_chunk}

OCR text (LLM-corrected): {ocr_text}

🔍 Open ÖNB Viewer

Source: C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens
und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften,
Reisen in Ober= und Niederägypten, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800

Citation link: http://data.onb.ac.at/rep/1058B194

"""


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## 🔍 Preview Text Retrieval Results with Marqo Vector Database")
    data_source_dropdown = gr.Dropdown(
        choices=list(data_sources.keys()),
        label="Select Data Source",
        value=DEFAULT_DATA_SOURCE,
    )
    # Size the slider for the default data source so paging works from the start.
    page_slider = gr.Slider(
        minimum=1,
        maximum=_page_count(len(data_sources[DEFAULT_DATA_SOURCE])),
        value=1,
        step=1,
        label="Page",
        interactive=True,
    )
    # Initialize preview with default data source
    preview_output = gr.HTML(value=preview_results(1, DEFAULT_DATA_SOURCE))

    gr.Markdown("## 📝 Inspect Document Details")
    doc_name_input = gr.Textbox(
        label="Copy and paste document name to search bar (e.g. Z166069305_430):",
        interactive=True,
    )
    inspect_button = gr.Button("Inspect")
    inspect_output = gr.HTML()

    # Function to update preview when data source changes
    def update_data_source(selected_data_source):
        """Return the first preview page plus a slider update for the new source."""
        max_page = _page_count(len(data_sources[selected_data_source]))
        # Component properties can only be changed from an event handler by
        # returning an update; assigning page_slider.maximum has no effect on
        # the rendered slider.
        return (
            preview_results(1, selected_data_source),
            gr.update(maximum=max_page, value=1),  # Reset slider to 1
        )

    # Function to update preview when page slider changes
    def update_preview(page, selected_data_source):
        """Return the preview for the page chosen on the slider."""
        return preview_results(page, selected_data_source)

    # Function to update document details
    def update_details(doc_name, selected_data_source):
        """Return the detail view for the document name entered by the user."""
        return show_details(doc_name, selected_data_source)

    # Handle data source change
    data_source_dropdown.change(
        update_data_source,
        inputs=[data_source_dropdown],
        outputs=[preview_output, page_slider],  # Update both preview and reset slider
    )

    # Handle page slider change
    page_slider.change(
        update_preview,
        inputs=[page_slider, data_source_dropdown],
        outputs=[preview_output],
    )

    # Handle inspect button click
    inspect_button.click(
        update_details,
        inputs=[doc_name_input, data_source_dropdown],
        outputs=[inspect_output],
    )

    # Further information block at the end
    gr.Markdown("""
## 📚 Further Information

This demo lets you explore our preliminary results for retrieving nature representations in imperfect OCR data extracted from 17-19 century German texts. This research was done in the Ottoman Nature in Travelogues (ONiT) project and funded by the Austrian Science Fund (FWF: P 35245). The text retrieval was done with hybrid vector/lexical search (BM25) by using a Marqo vector index. The texts were indexed as one page per document unit, and by splitting them in 2-sentence vectors and embedding them with flax-sentence-embeddings/all_datasets_v4_mpnet-base model.

For more information, contact michela(dot)vignoli(at)ait(dot)ac(dot)at.

""")

demo.launch()