Spaces:
Sleeping
Sleeping
Michela
committed on
Commit
·
559c653
1
Parent(s):
a9cd115
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Import packages
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
@@ -23,18 +28,84 @@ data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed
|
|
23 |
# Pagination settings
|
24 |
R = 5 # Number of preview rows per page
|
25 |
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def highlight_text(text, highlights):
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
if isinstance(highlights, str):
|
30 |
highlights = [highlights]
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
for highlight in highlights:
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
return text
|
36 |
|
37 |
-
|
38 |
# Function to create preview rows
|
39 |
def preview_results(page, selected_data_source):
|
40 |
data_source = data_sources[selected_data_source]
|
@@ -70,9 +141,9 @@ def show_details(document_name, selected_data_source):
|
|
70 |
return f"""
|
71 |
<div style="display: flex; justify-content: space-between; align-items: start;">
|
72 |
<div style="width: 65%; font-size: 18px;">
|
73 |
-
<h3>π Preview: {
|
74 |
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
|
75 |
-
<p><b>
|
76 |
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">π Open ΓNB Viewer</a></p>
|
77 |
</div>
|
78 |
<div style="width: 30%; text-align: right;">
|
@@ -92,7 +163,8 @@ with gr.Blocks() as demo:
|
|
92 |
## π Preview Text Retrieval Results with Marqo Vector Database
|
93 |
<div style="font-size: 18px;">
|
94 |
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
|
95 |
-
To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work
|
|
|
96 |
</div>""")
|
97 |
|
98 |
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
|
|
|
1 |
+
"""
|
2 |
+
Simple Gradio app to preview the preliminary results of retrieving nature representations from imperfect OCR data extracted from 17th- to 19th-century German texts in the ONiT project.
|
3 |
+
Code by Michela Vignoli partially generated with Chat GPT3, GPT4 (free version), and Claude (free version).
|
4 |
+
"""
|
5 |
+
|
6 |
# Import packages
|
7 |
import gradio as gr
|
8 |
import pandas as pd
|
|
|
28 |
# Pagination settings
|
29 |
R = 5 # Number of preview rows per page
|
30 |
|
31 |
+
def normalize_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped, so two strings that differ
    only in line breaks or spacing normalize to the same value.

    Args:
        text (str): The text to normalize. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        str: The whitespace-normalized text.
    """
    # Guard against missing values so callers do not crash on None.
    if not text:
        return ""
    # str.split() with no separator splits on any whitespace run and discards
    # empty leading/trailing pieces, which is exactly the collapse we want.
    return " ".join(text.split())
|
37 |
+
|
38 |
+
def find_best_match(needle, haystack, threshold=0.9):
    """Locate the span of *haystack* that best matches *needle*.

    The matching blocks shared by both strings are computed with
    ``difflib.SequenceMatcher``. Each block is scored by its similarity ratio
    against the whole needle, and the highest-scoring block is returned,
    provided its ratio is strictly greater than *threshold*.

    Args:
        needle (str): The text to look for.
        haystack (str): The text to search in.
        threshold (float): Minimum similarity ratio (exclusive) that a block
            must exceed to be accepted. Defaults to 0.9.

    Returns:
        tuple[int, int] | None: ``(start, end)`` indices into *haystack*
        (end exclusive) of the best block, or ``None`` when no block exceeds
        the threshold or either input is empty.
    """
    if not needle or not haystack:
        return None

    # autojunk must be disabled: with the default heuristic, any character
    # that makes up more than ~1% of a haystack of 200+ characters is treated
    # as junk, which hides nearly every letter of a normal page of text and
    # prevents any usable matching block from being found.
    blocks = SequenceMatcher(
        None, needle, haystack, autojunk=False
    ).get_matching_blocks()

    best_match = None
    best_ratio = threshold  # a candidate has to beat this to be accepted

    for _, hay_start, size in blocks:
        if size == 0:
            # get_matching_blocks() always ends with a zero-length sentinel.
            continue
        hay_end = hay_start + size
        candidate = haystack[hay_start:hay_end]
        ratio = SequenceMatcher(None, needle, candidate, autojunk=False).ratio()
        if ratio > best_ratio:
            best_match = (hay_start, hay_end)
            best_ratio = ratio

    return best_match
|
57 |
+
|
58 |
def _collapse_whitespace_with_offsets(text):
    """Collapse whitespace runs like ``' '.join(text.split())`` and record, for
    every character of the result, its index in the original *text*."""
    chars = []
    offsets = []
    pending_space = False
    for index, char in enumerate(text):
        if char.isspace():
            # Only emit a separator once real content has been seen, so that
            # leading whitespace is dropped.
            pending_space = bool(chars)
            continue
        if pending_space:
            # The single separator maps to the last whitespace character of
            # the run it replaces (trailing runs are never emitted).
            chars.append(" ")
            offsets.append(index - 1)
            pending_space = False
        chars.append(char)
        offsets.append(index)
    return "".join(chars), offsets


def highlight_text(text, highlights):
    """
    Highlight specified text segments using fuzzy matching and HTML mark tags.

    Matching is done on whitespace-normalized copies of the text and of each
    highlight; the matched span is then mapped back to the original text, so
    line breaks and repeated spaces in the original do not shift the
    highlight. Overlapping or touching spans are merged into a single
    ``<mark>`` element.

    Args:
        text (str): The original text to highlight
        highlights (str or list): Text segment(s) to highlight

    Returns:
        str: Text with highlights wrapped in <mark> tags. ``text`` is returned
        unchanged when it is empty or not a string, when there is nothing to
        highlight, or when no highlight matches.
    """
    if not text or not highlights or not isinstance(text, str):
        return text

    # Ensure highlights is a list
    if isinstance(highlights, str):
        highlights = [highlights]

    # Drop missing, non-string, or whitespace-only highlights
    needles = [h for h in highlights if isinstance(h, str) and h.strip()]
    if not needles:
        return text

    # Normalize the page text once (it does not change between highlights)
    # and keep a map from normalized indices back to original indices.
    normalized_text, offsets = _collapse_whitespace_with_offsets(text)
    if not normalized_text:
        return text

    # Collect the spans to highlight, expressed in original-text coordinates
    spans = []
    for needle in needles:
        match = find_best_match(normalize_text(needle), normalized_text)
        if not match:
            continue
        start, end = match
        # `end` is exclusive, so map the last matched character and add one.
        spans.append((offsets[start], offsets[end - 1] + 1))

    if not spans:
        return text

    # Merge overlapping/touching spans so <mark> tags are never nested or
    # inserted into the middle of one another.
    spans.sort()
    merged = [spans[0]]
    for start, end in spans[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))

    # Rebuild the text in one left-to-right pass
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(f"<mark>{text[start:end]}</mark>")
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)
|
108 |
|
|
|
109 |
# Function to create preview rows
|
110 |
def preview_results(page, selected_data_source):
|
111 |
data_source = data_sources[selected_data_source]
|
|
|
141 |
return f"""
|
142 |
<div style="display: flex; justify-content: space-between; align-items: start;">
|
143 |
<div style="width: 65%; font-size: 18px;">
|
144 |
+
<h3>π Preview: {document_name}</h3>
|
145 |
<p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
|
146 |
+
<p><b>Text on page {row['page']}: </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
|
147 |
<p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">π Open ΓNB Viewer</a></p>
|
148 |
</div>
|
149 |
<div style="width: 30%; text-align: right;">
|
|
|
163 |
## π Preview Text Retrieval Results with Marqo Vector Database
|
164 |
<div style="font-size: 18px;">
|
165 |
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
|
166 |
+
To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.
|
167 |
+
To inspect the page in the full book, click on <i>Open ONB Viewer</i> in the document details below.</p>
|
168 |
</div>""")
|
169 |
|
170 |
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
|