Michela committed on
Commit
c59454b
·
1 Parent(s): 099f9b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -2
app.py CHANGED
@@ -2,6 +2,7 @@
2
  import gradio as gr
3
  import pandas as pd
4
  from difflib import SequenceMatcher
 
5
 
6
  # Import results
7
  results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
@@ -45,6 +46,8 @@ def preview_results(page, selected_data_source):
45
  row_elements = []
46
  for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1):
47
  highlighted_text = row['unpacked_highlights']
 
 
48
  row_html = f"""
49
  <div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
50
  <b>{idx}. \'{row['document']}\'</b> - Score: {row['_score']} - Rank: {row['rank']}
@@ -85,7 +88,12 @@ def show_details(document_name, selected_data_source):
85
 
86
  # Gradio Interface
87
  with gr.Blocks() as demo:
88
- gr.Markdown("## 🔍 Preview Text Retrieval Results with Marqo Vector Database")
 
 
 
 
 
89
 
90
  data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
91
  page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True)
@@ -129,7 +137,7 @@ with gr.Blocks() as demo:
129
  # Further information block at the end
130
  gr.Markdown("""
131
  ## 📚 Further Information
132
- <div style="font-size: 18px;">
133
  <p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
134
  This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
135
  The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a>
 
2
  import gradio as gr
3
  import pandas as pd
4
  from difflib import SequenceMatcher
5
+ import re
6
 
7
  # Import results
8
  results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
 
46
  row_elements = []
47
  for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1):
48
  highlighted_text = row['unpacked_highlights']
49
+ # Highlight every word starting with "Pferd" (e.g. "Pferd", "Pferde"; case-insensitive) using a bold span with a yellow background
50
+ highlighted_text = re.sub(r'\b(Pferd\w*)\b', r"<span style='background-color: yellow; font-weight: bold;'>\1</span>", highlighted_text, flags=re.IGNORECASE)
51
  row_html = f"""
52
  <div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
53
  <b>{idx}. \'{row['document']}\'</b> - Score: {row['_score']} - Rank: {row['rank']}
 
88
 
89
  # Gradio Interface
90
  with gr.Blocks() as demo:
91
+ gr.Markdown("""
92
+ ## 🔍 Preview Text Retrieval Results with Marqo Vector Database
93
+ <div style="font-size: 16px;">
94
+ <p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
95
+ To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.</p>
96
+ </div>""")
97
 
98
  data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
99
  page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True)
 
137
  # Further information block at the end
138
  gr.Markdown("""
139
  ## 📚 Further Information
140
+ <div style="font-size: 16px;">
141
  <p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
142
  This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
143
  The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a>