File size: 11,414 Bytes
559c653
 
 
 
 
e62e0c5
 
 
 
c59454b
52d525a
e62e0c5
05202ae
 
099f9b1
ddbc294
e62e0c5
 
 
 
 
 
 
 
 
 
ddbc294
e62e0c5
 
 
 
559c653
 
 
 
 
 
 
 
 
 
 
 
 
 
03eaea3
559c653
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
559c653
 
 
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
 
559c653
 
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
559c653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
c59454b
 
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559c653
e62e0c5
559c653
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59454b
 
a9cd115
ff8bdc8
a3eb292
ff8bdc8
 
559c653
c59454b
e62e0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9cd115
e62e0c5
 
 
 
ff8bdc8
 
 
 
e62e0c5
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17-19 century German texts in the ONiT project.
Code by Michela Vignoli partially generated with Chat GPT3, GPT4 (free version), and Claude (free version).
"""

# Import packages
import gradio as gr
import pandas as pd
from difflib import SequenceMatcher
import re

# Import results
results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
results_prep = pd.read_csv("data/retrieval_results/sonnini_llm_corrected/i_onit-sonnini-DHd2025-prep-q_Pferd, Pferde.csv").head(100)
results_orig = pd.read_csv("data/retrieval_results/sonnini_original_OCR/i_onit-test-index-sonnini-q_Pferd-Pferde.csv").head(100)
annotations = pd.read_csv("data/annotations/DHd2025_referenceReports_annotations_preview_horses.csv")

# Drop 'text_prep' from results_orig
results_clean.drop(columns=['text_prep'], inplace=True)

# Modify the "document" column to remove "_page175.txt" and keep the "Z166069305_00175"
results_orig['document'] = results_orig['document'].str[:-12]

# Modify the "page" column to extract the numeric part and remove leading zeroes
results_orig['page'] = results_orig['page'].str.extract(r'(\d+)', expand=False).astype(int)

data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed OCR": results_prep, "Results Original OCR": results_orig, "Annotations": annotations}

# Pagination settings
R = 5  # Number of preview rows per page

def normalize_text(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed as well, which makes fuzzy
    comparisons between OCR variants of the same passage more reliable.
    """
    # str.split() with no argument splits on arbitrary whitespace runs and
    # drops empty fields, so join+split performs the full normalization.
    return " ".join(text.split())

def find_best_match(needle, haystack):
    """Locate the span of *haystack* that best resembles *needle*.

    Candidate spans are the matching blocks reported by SequenceMatcher;
    each candidate is re-scored against the full needle, and the highest
    scorer above a 0.5 similarity threshold wins.

    Returns:
        tuple | None: (start, end) indices into *haystack*, or None when
        no candidate clears the threshold.
    """
    best_span = None
    best_score = 0.5  # minimum acceptable similarity ratio

    for _, pos, size in SequenceMatcher(None, needle, haystack).get_matching_blocks():
        if size == 0:
            continue  # skip the terminating dummy block
        candidate = haystack[pos:pos + size]
        score = SequenceMatcher(None, needle, candidate).ratio()
        if score > best_score:
            best_score = score
            best_span = (pos, pos + size)

    return best_span

def _normalize_with_index_map(text):
    """Collapse whitespace exactly like normalize_text(), additionally
    returning a map from each normalized-character index back to its index
    in the original string (a collapsed whitespace run maps to the index of
    its first whitespace character)."""
    chars, index_map = [], []
    run_start = None  # original index of the pending whitespace run, if any
    for i, ch in enumerate(text):
        if ch.isspace():
            # Leading whitespace is dropped entirely; interior runs are
            # remembered so they can be emitted as a single space.
            if chars and run_start is None:
                run_start = i
        else:
            if run_start is not None:
                chars.append(" ")
                index_map.append(run_start)
                run_start = None
            chars.append(ch)
            index_map.append(i)
    return "".join(chars), index_map


def highlight_text(text, highlights):
    """
    Highlight specified text segments using fuzzy matching and HTML mark tags.

    Args:
        text (str): The original text to highlight
        highlights (str or list): Text segment(s) to highlight

    Returns:
        str: Text with highlights wrapped in <mark> tags
    """
    if not text or not highlights:
        return text

    # Ensure highlights is a list
    if isinstance(highlights, str):
        highlights = [highlights]

    # Remove empty or None highlights
    highlights = [h for h in highlights if h]
    if not highlights:
        return text

    # Sort highlights by length (longest first) so the most specific
    # passages claim their span first
    highlights = sorted(highlights, key=len, reverse=True)

    # Normalize once, keeping an index map back into the original text.
    # (The previous rstrip()-based back-mapping was off by the amount of
    # collapsed whitespace and shifted highlights onto wrong characters.)
    normalized_text, index_map = _normalize_with_index_map(text)

    # Collect (start, end) spans in original-text coordinates
    spans = []
    for highlight in highlights:
        match = find_best_match(normalize_text(highlight), normalized_text)
        if match:
            start, end = match
            spans.append((index_map[start], index_map[end - 1] + 1))

    if not spans:
        return text

    # Merge overlapping spans; inserting marks for overlapping ranges would
    # produce interleaved <mark> tags and malformed HTML.
    spans.sort()
    merged = [spans[0]]
    for start, end in spans[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))

    # Apply highlights from end to start to avoid position shifting
    for start, end in reversed(merged):
        text = f"{text[:start]}<mark>{text[start:end]}</mark>{text[end:]}"

    return text

# Function to create preview rows
def preview_results(page, selected_data_source):
    """Render one page (R rows) of retrieval results as an HTML string.

    Rows are taken from the DataFrame registered under
    *selected_data_source*; every token starting with "Pferd" in the
    retrieved chunk is wrapped in a yellow, bold span.
    """
    frame = data_sources[selected_data_source]
    first = (page - 1) * R
    last = min(first + R, len(frame))

    pieces = []
    rank_no = first
    for _, record in frame.iloc[first:last].iterrows():
        rank_no += 1
        # Highlight "Pferd" and all its inflections, case-insensitively
        marked = re.sub(
            r'\b(Pferd\w*)\b',
            r"<span style='background-color: yellow; font-weight: bold;'>\1</span>",
            record['unpacked_highlights'],
            flags=re.IGNORECASE,
        )
        fragment = f"""
        <div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
            <b>{rank_no}. \'{record['document']}\'</b> - Score: {record['_score']} - Rank: {record['rank']}
            <br><i>{marked}</i>
        </div>
        """
        pieces.append(fragment)

    return "".join(pieces)

# Function to show details of a selected row
def show_details(document_name, selected_data_source):
    """Render an HTML detail view for one retrieved document.

    Looks *document_name* up in the selected DataFrame and returns an HTML
    fragment with the retrieved chunk, the highlighted page text, a IIIF
    image preview, and a link to the ÖNB viewer. Returns an error message
    when the document is not found.
    """
    data_source = data_sources[selected_data_source]
    row = data_source[data_source["document"] == document_name]
    
    if row.empty:
        return "<p style='color:red;'>Document not found. Please select a valid document.</p>"

    row = row.iloc[0]  # Extract first matching row
    # Page text column differs per data source: prefer the LLM-corrected
    # text, then the regex-cleaned text, then the raw OCR text.
    # Fixed below: the citation link previously had no space before
    # target="_blank" (malformed attribute) and an unclosed <b> tag.
    return f"""
    <div style="display: flex; justify-content: space-between; align-items: start;">
        <div style="width: 65%; font-size: 18px;">
            <h3>📄 Preview: {document_name}</h3>
            <p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
            <p><b>Text on page {row['page']}: </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
            <p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">🔍 Open ÖNB Viewer</a></p>
        </div>
        <div style="width: 30%; text-align: right;">
            <img src="{row['iiif_link']}" alt="IIIF Image Preview" 
                 style="max-width: 100%; height: auto; border: 1px solid #ddd;">
        </div>
    </div>
    <div style="font-size: 18px;">
        <p><b>Source: </b>C. S. Sonnini's, ehemaligen Offiziers und Jngenieurs des französischen Seewesens <br>und Mitgliedes mehrerer gelehrten und litterarischen Gesellschaften, <br><i>Reisen in Ober= und Niederägypten</i>, Bd. 1. Leipzig/Gera: Wilh. Heinsius, 1800</p>
        <p><b>Citation link: </b><a href="http://data.onb.ac.at/rep/1058B194" target="_blank">http://data.onb.ac.at/rep/1058B194</a></p>
    </div>
    """

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("""
                ## 🔍 Preview Text Retrieval Results with Marqo Vector Database
                <div style="font-size: 18px;">
                <p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected). 
                Select the data source: Choose between <i>Results Cleaned OCR, Results LLM Preprocessed OCR, Results Original OCR,</i> and our <i>Annotations</i> of text passages mentioning <i>horses and kindred animals</i> in the text. 
                To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. 
                Please note that pressing <i>Enter</i> does not work. 
                To inspect the page in the full book, click on <i>Open ONB Viewer</i> in the document details below.</p>
                </div>""")

    data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
    page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True)
    preview_output = gr.HTML()

    gr.Markdown("## 📝 Inspect Document Details")

    doc_name_input = gr.Textbox(label="Copy and paste document name to search bar (e.g. Z166069305_430):", interactive=True)
    inspect_button = gr.Button("Inspect")
    inspect_output = gr.HTML()

    # Function to update preview when data source changes
    def update_data_source(selected_data_source):
        """Rebuild page 1 of the preview for the new source and resize/reset the slider."""
        # Ceiling division: 100 rows at 5/page -> 20 pages. The previous
        # `len // R + 1` produced a trailing empty page whenever len % R == 0.
        max_page = max(1, -(-len(data_sources[selected_data_source]) // R))
        # Assigning page_slider.maximum after creation never reaches the
        # client, so the new range must travel through the event output.
        return preview_results(1, selected_data_source), gr.update(maximum=max_page, value=1)

    # Function to update preview when page slider changes
    def update_preview(page, selected_data_source):
        return preview_results(page, selected_data_source)

    # Function to update document details
    def update_details(doc_name, selected_data_source):
        return show_details(doc_name, selected_data_source)

    # Handle data source change: refresh the preview and reset the slider
    data_source_dropdown.change(
        update_data_source,
        inputs=[data_source_dropdown],
        outputs=[preview_output, page_slider]
    )
    # Handle page slider change
    page_slider.change(update_preview, inputs=[page_slider, data_source_dropdown], outputs=[preview_output])

    # Handle inspect button click
    inspect_button.click(update_details, inputs=[doc_name_input, data_source_dropdown], outputs=[inspect_output])

    # Initialize preview with default data source.
    # update_data_source returns (html, slider_update); the old code assigned
    # the whole tuple to preview_output.value, rendering the tuple repr.
    initial_preview, _ = update_data_source("Results Cleaned OCR")
    preview_output.value = initial_preview

    # Further information block at the end
    gr.Markdown("""
    ## 📚 Further Information
    <div style="font-size: 18px;">
        <p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
        This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
        The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a> 
        vector index. The texts were indexed as one page per document unit, and by splitting them in 2-sentence vectors and embedding them with 
        <a href="https://huggingface.co/flax-sentence-embeddings/all_datasets_v4_mpnet-base">flax-sentence-embeddings/all_datasets_v4_mpnet-base</a> model. 
        <i>Results Cleaned OCR</i> contain the retrieval results for the vectorized OCR texts that were cleaned by using regular expressions. 
        <i>Results LLM Preprocessed OCR</i> contain the retrieval results for the vectorized OCR texts that were automatically corrected with Llama3.1:70b. 
        <i>Results Original OCR</i> contain the retrieval results for the original OCR texts (without any preprocessing).</p>
        <p>For more information, contact <a href="mailto:[email protected]">michela(dot)vignoli(at)ait(dot)ac(dot)at</a>.</p>
    </div>
    """)

demo.launch()