"""
Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17th to 19th century German texts in the ONiT project.
Code by Michela Vignoli, partially generated with ChatGPT (GPT-3 and GPT-4, free versions) and Claude (free version).
"""
# Import packages
import gradio as gr
import pandas as pd
from difflib import SequenceMatcher
import re
# Import results
results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
results_prep = pd.read_csv("data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv").head(100)
results_orig = pd.read_csv("data/retrieval_results/sonnini_original_OCR/i_onit-test-index-sonnini-q_Pferd-Pferde.csv").head(100)
annotations = pd.read_csv("data/annotations/DHd2025_referenceReports_annotations_preview_horses.csv")
# Drop 'text_prep' from results_clean
results_clean.drop(columns=['text_prep'], inplace=True)
# Modify the "document" column to remove "_page175.txt" and keep the "Z166069305_00175"
results_orig['document'] = results_orig['document'].str[:-12]
# Modify the "page" column to extract the numeric part and remove leading zeroes
results_orig['page'] = results_orig['page'].str.extract(r'(\d+)', expand=False).astype(int)
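# Illustrative sketch of the two transformations above (assumed example values inferred from the comments, not read from the CSVs):
#   "Z166069305_00175_page175.txt"[:-12]                                   -> "Z166069305_00175"
#   pd.Series(["page175"]).str.extract(r'(\d+)', expand=False).astype(int) -> 175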
data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed OCR": results_prep, "Results Original OCR": results_orig, "Annotations": annotations}
# Pagination settings
R = 5 # Number of preview rows per page
def normalize_text(text):
"""Normalize text for better matching by removing extra whitespace and standardizing characters."""
# Remove extra whitespace
text = ' '.join(text.split())
# Could add more normalization steps here if needed
return text
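# Minimal usage sketch (not called by the app): runs of spaces and newlines are collapsed, e.g.
#   normalize_text("Pferde  im\n  Stall") -> "Pferde im Stall"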
def find_best_match(needle, haystack):
"""Find the best matching position of needle in haystack using fuzzy matching."""
matcher = SequenceMatcher(None, needle, haystack)
matches = matcher.get_matching_blocks()
# Find the best match that exceeds our threshold
best_match = None
best_match_ratio = 0.5 # Initialize the best match ratio with our minimum threshold
for match in matches:
i, j, n = match
if n > 0: # Only consider non-zero length matches
subsequence = haystack[j:j+n]
ratio = SequenceMatcher(None, needle, subsequence).ratio()
if ratio > best_match_ratio:
best_match = (j, j+n)
best_match_ratio = ratio
return best_match
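# Minimal usage sketch (not called by the app; example strings are assumptions):
#   find_best_match("Pferd", "Das Pferd im Stall") should return (4, 9), the span of the exact substring,
#   while find_best_match("Kamel", "Das Pferd im Stall") should return None, since no matching block clears the 0.5 ratio threshold.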
def highlight_text(text, highlights):
"""
Highlight specified text segments using fuzzy matching and HTML mark tags.
Args:
text (str): The original text to highlight
highlights (str or list): Text segment(s) to highlight
Returns:
str: Text with highlights wrapped in <mark> tags
"""
if not text or not highlights:
return text
# Ensure highlights is a list
if isinstance(highlights, str):
highlights = [highlights]
# Remove empty or None highlights
highlights = [h for h in highlights if h]
if not highlights:
return text
# Sort highlights by length (longest first) to avoid nested highlights
highlights = sorted(highlights, key=len, reverse=True)
# Store positions to highlight
positions_to_highlight = []
# Find positions for each highlight
for highlight in highlights:
normalized_highlight = normalize_text(highlight)
normalized_text = normalize_text(text)
match = find_best_match(normalized_highlight, normalized_text)
if match:
start, end = match
# Convert positions back to original text
original_start = len(text[:start].rstrip())
original_end = original_start + len(text[start:end].strip())
positions_to_highlight.append((original_start, original_end))
# Sort positions by start position
positions_to_highlight.sort()
# Apply highlights from end to start to avoid position shifting
for start, end in reversed(positions_to_highlight):
text = f"{text[:start]}<mark>{text[start:end]}</mark>{text[end:]}"
return text
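# Minimal usage sketch (not called by the app; example strings are assumptions):
#   highlight_text("Pferde und Esel auf dem Markt", "Pferde und Esel")
#   should yield "<mark>Pferde und Esel</mark> auf dem Markt"; because matching is fuzzy, a highlight with small OCR errors can still be marked.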
# Function to create preview rows
def preview_results(page, selected_data_source):
data_source = data_sources[selected_data_source]
start_idx = (page - 1) * R
end_idx = min(start_idx + R, len(data_source))
results = data_source.iloc[start_idx:end_idx]
row_elements = []
for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1):
highlighted_text = row['unpacked_highlights']
# Highlight "Pferd" and "Pferde" using a span with a yellow background
highlighted_text = re.sub(r'\b(Pferd\w*)\b', r"<span style='background-color: yellow; font-weight: bold;'>\1</span>", highlighted_text, flags=re.IGNORECASE)
row_html = f"""
<div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
<b>{idx}. \'{row['document']}\'</b> - Score: {row['_score']} - Rank: {row['rank']}
<br><i>{highlighted_text}</i>
</div>
"""
row_elements.append(row_html)
return "".join(row_elements)
# Function to show details of a selected row
def show_details(document_name, selected_data_source):
data_source = data_sources[selected_data_source]
row = data_source[data_source["document"] == document_name]
if row.empty:
return "<p style='color:red;'>Document not found. Please select a valid document.</p>"
row = row.iloc[0] # Extract first matching row
return f"""
<div style="display: flex; justify-content: space-between; align-items: start;">
<div style="width: 65%; font-size: 18px;">
<h3>📄 Preview: {document_name}</h3>
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
<p><b>Text on page {row['page']}: </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">🔍 Open ÖNB Viewer</a></p>
</div>
<div style="width: 30%; text-align: right;">
<img src="{row['iiif_link']}" alt="IIIF Image Preview"
style="max-width: 100%; height: auto; border: 1px solid #ddd;">
</div>
</div>
<div style="font-size: 18px;">
<p><b>Source: </b>C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens <br>und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften, <br><i>Reisen in Ober= und Niederägypten</i>, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800</p>
<p><b>Citation link:</b> <a href="http://data.onb.ac.at/rep/1058B194" target="_blank">http://data.onb.ac.at/rep/1058B194</a></p>
</div>
"""
# Gradio Interface
with gr.Blocks() as demo:
gr.Markdown("""
## 🔍 Preview Text Retrieval Results with Marqo Vector Database
<div style="font-size: 18px;">
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> using the page slider (the first 100 retrieval results can be inspected).
Select the data source: choose between <i>Results Cleaned OCR, Results LLM Preprocessed OCR, Results Original OCR,</i> and our <i>Annotations</i> of text passages mentioning <i>horses and kindred animals</i> in the text.
To view details about a retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) into the search box below and click the <i>Inspect</i> button.
Please note that pressing <i>Enter</i> does not work.
To inspect the page in the full book, click <i>Open ÖNB Viewer</i> in the document details below.</p>
</div>""")
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
page_slider = gr.Slider(1, max(1, -(-len(data_sources["Results Cleaned OCR"]) // R)), step=1, label="Page", interactive=True)  # Initial page count for the default data source (ceiling division)
preview_output = gr.HTML()
gr.Markdown("## 📝 Inspect Document Details")
doc_name_input = gr.Textbox(label="Copy and paste the document name here (e.g. Z166069305_430):", interactive=True)
inspect_button = gr.Button("Inspect")
inspect_output = gr.HTML()
# Function to update the preview and page range when the data source changes
def update_data_source(selected_data_source):
max_page = max(1, -(-len(data_sources[selected_data_source]) // R))  # Ceiling division: number of preview pages
# Reset the preview to page 1 and adjust the slider's maximum and value via gr.update
return preview_results(1, selected_data_source), gr.update(maximum=max_page, value=1)
# Function to update preview when page slider changes
def update_preview(page, selected_data_source):
return preview_results(page, selected_data_source)
# Function to update document details
def update_details(doc_name, selected_data_source):
return show_details(doc_name, selected_data_source)
# Handle data source change
data_source_dropdown.change(
update_data_source,
inputs=[data_source_dropdown],
outputs=[preview_output, page_slider] # Update both preview and reset slider
)
# Handle page slider change
page_slider.change(update_preview, inputs=[page_slider, data_source_dropdown], outputs=[preview_output])
# Handle inspect button click
inspect_button.click(update_details, inputs=[doc_name_input, data_source_dropdown], outputs=[inspect_output])
# Initialize the preview and slider for the default data source when the app loads
demo.load(update_data_source, inputs=[data_source_dropdown], outputs=[preview_output, page_slider])
# Further information block at the end
gr.Markdown("""
## 📚 Further Information
<div style="font-size: 18px;">
<p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17th to 19th century German texts.
This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
The text retrieval was done with hybrid vector/lexical search (BM25) using a <a href="https://docs.marqo.ai/">Marqo</a>
vector index. The texts were indexed with one page per document unit, split into 2-sentence chunks, and embedded with the
<a href="https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base">flax-sentence-embeddings/all_datasets_v4_mpnet-base</a> model.
<i>Results Cleaned OCR</i> contains the retrieval results for the vectorized OCR texts that were cleaned using regular expressions.
<i>Results LLM Preprocessed OCR</i> contains the retrieval results for the vectorized OCR texts that were automatically corrected with Llama3.1:70b.
<i>Results Original OCR</i> contains the retrieval results for the original OCR texts (without any preprocessing).</p>
<p>For more information, contact <a href="mailto:michela.vignoli@ait.ac.at">michela(dot)vignoli(at)ait(dot)ac(dot)at</a>.</p>
</div>
""")
demo.launch()