Spaces:
Sleeping
Sleeping
File size: 7,088 Bytes
e62e0c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# Import packages
import gradio as gr
import pandas as pd
from difflib import SequenceMatcher
# Import results
# Load the top 100 hits of each retrieval run. ``.copy()`` detaches every frame
# from the full CSV frame so the column edits below operate on independent data
# and do not raise pandas' SettingWithCopyWarning.
results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100).copy()
results_prep = pd.read_csv("data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv").head(100).copy()
results_orig = pd.read_csv("data/retrieval_results/sonnini_original_ocr/i_onit-test-index-sonnini-q_Pferd-Pferde.csv").head(100).copy()

# Drop 'text_prep' from results_clean: show_details() prefers 'text_prep' over
# 'text_clean', so the column has to go for this source to display 'text_clean'.
results_clean.drop(columns=['text_prep'], inplace=True)

# Strip the trailing "_page<N>.txt" from the "document" column, keeping e.g.
# "Z166069305_00175". The anchored pattern works for any number of page digits,
# unlike a fixed 12-character slice, which is only right for three-digit pages.
results_orig['document'] = results_orig['document'].str.replace(r'_page\d+\.txt$', '', regex=True)

# Keep only the first run of digits of the "page" column as an integer
# (this also removes leading zeroes).
# NOTE(review): assumes every "page" value is a string containing digits - confirm.
results_orig['page'] = results_orig['page'].str.extract(r'(\d+)', expand=False).astype(int)

# Display name (shown in the dropdown) -> result table
data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed OCR": results_prep, "Results Original OCR": results_orig}

# Pagination settings
R = 5  # Number of preview rows per page
# Define a function to highlight parts of the text
def highlight_text(text, highlights):
    """Wrap every occurrence of the given highlight strings in ``<mark>`` tags.

    Args:
        text: The text to decorate. ``None`` and NaN (a missing CSV cell) yield
            an empty string; any other non-string value is converted with ``str``.
        highlights: A single string or an iterable of strings to mark.
            Empty strings, duplicates and non-string entries are ignored.

    Returns:
        The text with each match wrapped as ``<mark>match</mark>``.
    """
    import re  # local import keeps this function self-contained

    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    if isinstance(highlights, str):
        highlights = [highlights]
    elif highlights is None or not hasattr(highlights, "__iter__"):
        highlights = []

    # De-duplicate (the same needle must not be wrapped twice) and drop empty
    # strings: replacing "" would insert a tag between every character.
    needles = list(dict.fromkeys(h for h in highlights if isinstance(h, str) and h))
    if not needles:
        return text

    # Longest first, so that "Pferde" wins over "Pferd" at the same position.
    needles.sort(key=len, reverse=True)

    # One pass over the original text: tags inserted for one highlight can never
    # be matched (and corrupted) by another highlight such as "mark".
    pattern = re.compile("|".join(re.escape(needle) for needle in needles))
    return pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", text)
# Function to create preview rows
def preview_results(page, selected_data_source):
    """Render one page of result previews as a single HTML string.

    Args:
        page: 1-based page number.
        selected_data_source: Key into the module-level ``data_sources`` dict.

    Returns:
        The concatenated HTML cards for the rows on that page; each card shows
        the running result number, document name, score, rank and the
        retrieved highlight text.
    """
    frame = data_sources[selected_data_source]
    first = (page - 1) * R
    last = min(first + R, len(frame))

    cards = []
    number = first  # running 1-based result number across all pages
    for _, record in frame.iloc[first:last].iterrows():
        number += 1
        snippet = record['unpacked_highlights']
        cards.append(f"""
<div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
<b>{number}. \'{record['document']}\'</b> - Score: {record['_score']} - Rank: {record['rank']}
<br><i>{snippet}</i>
</div>
""")
    return "".join(cards)
# Function to show details of a selected row
def show_details(document_name, selected_data_source):
    """Build the HTML detail view for one document of a data source.

    Args:
        document_name: Value of the ``document`` column to look up, typically
            pasted by the user; surrounding whitespace is ignored.
        selected_data_source: Key into the module-level ``data_sources`` dict.

    Returns:
        An HTML string with the retrieved chunk, the highlighted OCR text, a
        viewer link, the page image and the source citation - or a red error
        paragraph when no row matches ``document_name``.
    """
    data_source = data_sources[selected_data_source]

    # Copy-pasted names often carry stray whitespace; compare stripped values on
    # both sides so such input still finds its row.
    wanted = "" if document_name is None else str(document_name).strip()
    matches = data_source[data_source["document"].astype(str).str.strip() == wanted]
    if matches.empty:
        return "<p style='color:red;'>Document not found. Please select a valid document.</p>"
    row = matches.iloc[0]  # Extract first matching row

    # Use the first text column that holds a real, non-empty string. A plain
    # ``or`` chain would stop at NaN (missing cell), because NaN is truthy.
    ocr_text = ""
    for column in ("text_prep", "text_clean", "text"):
        candidate = row.get(column)
        if isinstance(candidate, str) and candidate:
            ocr_text = candidate
            break

    # A missing highlight cell is NaN; pass an empty list instead.
    highlights = row["unpacked_highlights"]
    if not isinstance(highlights, (str, list, tuple)):
        highlights = []
    marked_text = highlight_text(ocr_text, highlights)

    # NOTE(review): the "(LLM-corrected)" label is shown for every data source,
    # including those where 'text_clean' or 'text' is displayed - confirm intent.
    return f"""
<div style="display: flex; justify-content: space-between; align-items: start;">
<div style="width: 65%; font-size: 18px;">
<h3>📄 Preview: {row['barcode']}, Page {row['page']}</h3>
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
<p><b>OCR text (LLM-corrected): </b>{marked_text}</p>
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">🔍 Open ÖNB Viewer</a></p>
</div>
<div style="width: 30%; text-align: right;">
<img src="{row['iiif_link']}" alt="IIIF Image Preview"
style="max-width: 100%; height: auto; border: 1px solid #ddd;">
</div>
</div>
<div style="font-size: 18px;">
<p><b>Source: </b>C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens <br>und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften, <br><i>Reisen in Ober= und Niederägypten</i>, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800</p>
<p><b>Citation link: </b><a href="http://data.onb.ac.at/rep/1058B194" target="_blank">http://data.onb.ac.at/rep/1058B194</a></p>
</div>
"""
# Gradio Interface
with gr.Blocks() as demo:
    # Data source shown when the app starts.
    default_source = "Results Cleaned OCR"

    def _page_count(selected_data_source):
        """Return the number of preview pages (at least 1) for a data source."""
        total_rows = len(data_sources[selected_data_source])
        # Ceiling division: 100 rows at 5 per page are 20 pages, not 21.
        return max(1, -(-total_rows // R))

    gr.Markdown("## 🔍 Preview Text Retrieval Results with Marqo Vector Database")
    data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value=default_source)
    # The initial range and preview are set at construction time, so no
    # component attribute has to be patched after the layout is built.
    page_slider = gr.Slider(1, _page_count(default_source), value=1, step=1, label="Page", interactive=True)
    preview_output = gr.HTML(value=preview_results(1, default_source))

    gr.Markdown("## 📝 Inspect Document Details")
    doc_name_input = gr.Textbox(label="Copy and paste document name to search bar (e.g. Z166069305_430):", interactive=True)
    inspect_button = gr.Button("Inspect")
    inspect_output = gr.HTML()

    # Function to update preview when data source changes
    def update_data_source(selected_data_source):
        """Return the first preview page and a slider update for the new source.

        The slider range is returned as an update object: assigning to
        ``page_slider.maximum`` inside a handler does not reach the browser.
        """
        slider_update = gr.update(maximum=_page_count(selected_data_source), value=1)
        return preview_results(1, selected_data_source), slider_update

    # Function to update preview when page slider changes
    def update_preview(page, selected_data_source):
        """Return the preview HTML for the chosen page of the chosen source."""
        # int() guards against the slider delivering a float such as 3.0,
        # which cannot be used as a positional slice bound.
        return preview_results(int(page), selected_data_source)

    # Function to update document details
    def update_details(doc_name, selected_data_source):
        """Return the detail HTML for the document named in the text box."""
        return show_details(doc_name, selected_data_source)

    # Handle data source change
    data_source_dropdown.change(
        update_data_source,
        inputs=[data_source_dropdown],
        outputs=[preview_output, page_slider],  # Update both preview and reset slider
    )

    # Handle page slider change
    page_slider.change(update_preview, inputs=[page_slider, data_source_dropdown], outputs=[preview_output])

    # Handle inspect button click
    inspect_button.click(update_details, inputs=[doc_name_input, data_source_dropdown], outputs=[inspect_output])

    # Further information block at the end
    gr.Markdown("""
## 📚 Further Information
<div style="font-size: 18px;">
<p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a>
vector index. The texts were indexed as one page per document unit, and by splitting them in 2-sentence vectors and embedding them with
<a href="https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base">flax-sentence-embeddings/all_datasets_v4_mpnet-base</a> model.</p>
<p>For more information, contact <a href="mailto:michela.vignoli@ait.ac.at">michela(dot)vignoli(at)ait(dot)ac(dot)at</a>.</p>
</div>
""")

demo.launch()