Michela committed on
Commit
559c653
·
1 Parent(s): a9cd115

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -9
app.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  # Import packages
2
  import gradio as gr
3
  import pandas as pd
@@ -23,18 +28,84 @@ data_sources = {"Results Cleaned OCR": results_clean, "Results LLM Preprocessed
23
  # Pagination settings
24
  R = 5 # Number of preview rows per page
25
 
26
- # Define a function to highlight parts of the text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def highlight_text(text, highlights):
28
- # Ensure highlights is a list of strings
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  if isinstance(highlights, str):
30
  highlights = [highlights]
31
- # Wrap each highlight in <mark> tags
 
 
 
 
 
 
 
 
 
 
 
 
32
  for highlight in highlights:
33
- # Replace highlight text with a highlighted version
34
- text = text.replace(highlight, f'<mark>{highlight}</mark>')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  return text
36
 
37
-
38
  # Function to create preview rows
39
  def preview_results(page, selected_data_source):
40
  data_source = data_sources[selected_data_source]
@@ -70,9 +141,9 @@ def show_details(document_name, selected_data_source):
70
  return f"""
71
  <div style="display: flex; justify-content: space-between; align-items: start;">
72
  <div style="width: 65%; font-size: 18px;">
73
- <h3>πŸ“„ Preview: {row['barcode']}, Page {row['page']}</h3>
74
  <p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
75
- <p><b>OCR text (LLM-corrected): </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
76
  <p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">πŸ” Open Γ–NB Viewer</a></p>
77
  </div>
78
  <div style="width: 30%; text-align: right;">
@@ -92,7 +163,8 @@ with gr.Blocks() as demo:
92
  ## πŸ” Preview Text Retrieval Results with Marqo Vector Database
93
  <div style="font-size: 18px;">
94
  <p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
95
- To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.</p>
 
96
  </div>""")
97
 
98
  data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
 
1
+ """
2
+ Simple Gradio app to preview the preliminary results for retrieving nature representations in imperfect OCR data extracted from 17-19 century German texts in the ONiT project.
3
+ Code by Michela Vignoli partially generated with Chat GPT3, GPT4 (free version), and Claude (free version).
4
+ """
5
+
6
  # Import packages
7
  import gradio as gr
8
  import pandas as pd
 
28
  # Pagination settings
29
  R = 5 # Number of preview rows per page
30
 
31
def normalize_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    The text is split on whitespace and re-joined with one space, which
    also discards leading and trailing whitespace.

    Args:
        text (str): The text to normalize.

    Returns:
        str: The whitespace-normalized text.
    """
    # Further normalization steps could be added here if matching needs them.
    tokens = text.split()
    return ' '.join(tokens)
37
+
38
def find_best_match(needle, haystack, threshold=0.9):
    """Find where *needle* (or nearly all of it) occurs in *haystack*.

    An exact occurrence is returned directly. Otherwise the contiguous
    blocks that ``SequenceMatcher`` finds in common between the two strings
    are scored, and the best block whose similarity to *needle* is strictly
    greater than *threshold* wins.

    Args:
        needle (str): The text to look for.
        haystack (str): The text to search in.
        threshold (float): Minimum similarity ratio (exclusive) a block must
            exceed to be accepted. Defaults to 0.9.

    Returns:
        tuple[int, int] | None: ``(start, end)`` offsets into *haystack* of
        the best matching block (``end`` exclusive), or ``None`` when either
        input is empty or no block exceeds the threshold.
    """
    if not needle or not haystack:
        return None

    # Fast path: an exact occurrence is always the best possible match and
    # avoids the much more expensive sequence alignment below.
    exact_start = haystack.find(needle)
    if exact_start != -1:
        return (exact_start, exact_start + len(needle))

    # autojunk must be disabled: with the default heuristic, characters that
    # are frequent in a haystack of 200+ items are treated as junk, which
    # breaks up or hides matches in page-length texts.
    matcher = SequenceMatcher(None, needle, haystack, autojunk=False)

    best_match = None
    best_match_ratio = threshold  # only blocks strictly above this qualify
    needle_length = len(needle)

    for _, j, n in matcher.get_matching_blocks():
        if n == 0:  # skip the zero-length terminator block
            continue
        # The block is an exact substring of needle, so the similarity ratio
        # 2 * matches / (len(needle) + len(block)) has exactly n matches.
        ratio = 2.0 * n / (needle_length + n)
        if ratio > best_match_ratio:
            best_match = (j, j + n)
            best_match_ratio = ratio

    return best_match
57
+
58
def highlight_text(text, highlights):
    """
    Highlight specified text segments using fuzzy matching and HTML mark tags.

    Matching is done on whitespace-normalized copies of the text and of each
    highlight; the matched offsets are then translated back to the original
    text, so the returned string keeps its original whitespace. Overlapping
    matches are merged into a single ``<mark>`` element.

    NOTE: the text is inserted as-is (no HTML escaping is applied here).

    Args:
        text (str): The original text to highlight
        highlights (str or list): Text segment(s) to highlight

    Returns:
        str: Text with highlights wrapped in <mark> tags. The input is
        returned unchanged when it is empty, when there are no non-empty
        highlights, or when nothing matches.
    """
    if not text or not highlights:
        return text

    # Ensure highlights is a list
    if isinstance(highlights, str):
        highlights = [highlights]

    # Remove empty or None highlights
    highlights = [h for h in highlights if h]
    if not highlights:
        return text

    # Normalize the text once (it does not change between highlights) and
    # remember where every normalized character came from.
    normalized_text, offsets = _normalize_with_offsets(text)

    # Collect (start, end) spans expressed in ORIGINAL text coordinates
    spans = []
    for highlight in highlights:
        match = find_best_match(normalize_text(highlight), normalized_text)
        if not match:
            continue
        start, end = match
        # Defensive: ignore empty or out-of-range matches
        if not 0 <= start < end <= len(offsets):
            continue
        # end is exclusive, so map the last matched character and step past it
        spans.append((offsets[start], offsets[end - 1] + 1))

    if not spans:
        return text

    # Merge overlapping spans so the <mark> tags never interleave or nest
    spans.sort()
    merged = [spans[0]]
    for start, end in spans[1:]:
        last_start, last_end = merged[-1]
        if start < last_end:
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))

    # Assemble the result in a single left-to-right pass
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(f"<mark>{text[start:end]}</mark>")
        cursor = end
    pieces.append(text[cursor:])

    return ''.join(pieces)


def _normalize_with_offsets(text):
    """Return (normalized, offsets) with offsets[k] = index in text of normalized[k].

    Produces the same string as ``' '.join(text.split())``: runs of
    whitespace collapse to one space, and leading/trailing whitespace is
    dropped. A collapsed space maps to the first whitespace character of its
    run.
    """
    chars = []
    offsets = []
    pending_space_at = None  # start of a whitespace run not yet emitted
    for index, char in enumerate(text):
        if char.isspace():
            # Leading whitespace (nothing emitted yet) is dropped entirely
            if chars and pending_space_at is None:
                pending_space_at = index
            continue
        if pending_space_at is not None:
            # Emit the separator only once a following word is seen, so
            # trailing whitespace never produces a space
            chars.append(' ')
            offsets.append(pending_space_at)
            pending_space_at = None
        chars.append(char)
        offsets.append(index)
    return ''.join(chars), offsets
108
 
 
109
  # Function to create preview rows
110
  def preview_results(page, selected_data_source):
111
  data_source = data_sources[selected_data_source]
 
141
  return f"""
142
  <div style="display: flex; justify-content: space-between; align-items: start;">
143
  <div style="width: 65%; font-size: 18px;">
144
+ <h3>πŸ“„ Preview: {document_name}</h3>
145
  <p><b>Retrieved text chunk: </b><i>{row["unpacked_highlights"]}</i></p>
146
+ <p><b>Text on page {row['page']}: </b>{highlight_text(row.get('text_prep') or row.get('text_clean') or row.get('text'), row["unpacked_highlights"])}</p>
147
  <p><a href="https://digital.onb.ac.at/OnbViewer/viewer.faces?doc=ABO_%2B{row['barcode']}&order={row['page']}&view=SINGLE" target="_blank">πŸ” Open Γ–NB Viewer</a></p>
148
  </div>
149
  <div style="width: 30%; text-align: right;">
 
163
  ## πŸ” Preview Text Retrieval Results with Marqo Vector Database
164
  <div style="font-size: 18px;">
165
  <p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
166
+ To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.
167
+ To inspect the page in the full book, click on <i>Open ONB Viewer</i> in the document details below.</p>
168
  </div>""")
169
 
170
  data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")