Spaces:
Sleeping
Sleeping
Michela
committed on
Commit
·
c59454b
1
Parent(s):
099f9b1
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
from difflib import SequenceMatcher
|
|
|
5 |
|
6 |
# Import results
|
7 |
results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
|
@@ -45,6 +46,8 @@ def preview_results(page, selected_data_source):
|
|
45 |
row_elements = []
|
46 |
for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1):
|
47 |
highlighted_text = row['unpacked_highlights']
|
|
|
|
|
48 |
row_html = f"""
|
49 |
<div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
|
50 |
<b>{idx}. \'{row['document']}\'</b> - Score: {row['_score']} - Rank: {row['rank']}
|
@@ -85,7 +88,12 @@ def show_details(document_name, selected_data_source):
|
|
85 |
|
86 |
# Gradio Interface
|
87 |
with gr.Blocks() as demo:
|
88 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
|
91 |
page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True)
|
@@ -129,7 +137,7 @@ with gr.Blocks() as demo:
|
|
129 |
# Further information block at the end
|
130 |
gr.Markdown("""
|
131 |
## π Further Information
|
132 |
-
<div style="font-size:
|
133 |
<p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
|
134 |
This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
|
135 |
The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a>
|
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
from difflib import SequenceMatcher
|
5 |
+
import re
|
6 |
|
7 |
# Import results
|
8 |
results_clean = pd.read_csv("data/retrieval_results/sonnini_cleaned/i_onit-sonnini-DHd2025-clean-q_Pferd, Pferde.csv").head(100)
|
|
|
46 |
row_elements = []
|
47 |
for idx, (_, row) in enumerate(results.iterrows(), start=start_idx + 1):
|
48 |
highlighted_text = row['unpacked_highlights']
|
49 |
+
# Highlight every word starting with "Pferd" (case-insensitive, e.g. "Pferd", "Pferde", "Pferden") in bold on a yellow background
|
50 |
+
highlighted_text = re.sub(r'\b(Pferd\w*)\b', r"<span style='background-color: yellow; font-weight: bold;'>\1</span>", highlighted_text, flags=re.IGNORECASE)
|
51 |
row_html = f"""
|
52 |
<div style='border:1px solid #ddd; padding:10px; margin:5px 0; font-size: 18px;'>
|
53 |
<b>{idx}. \'{row['document']}\'</b> - Score: {row['_score']} - Rank: {row['rank']}
|
|
|
88 |
|
89 |
# Gradio Interface
|
90 |
with gr.Blocks() as demo:
|
91 |
+
gr.Markdown("""
|
92 |
+
## π Preview Text Retrieval Results with Marqo Vector Database
|
93 |
+
<div style="font-size: 16px;">
|
94 |
+
<p><b>Instructions:</b> Browse through the retrieval results for the text prompt <i>"Pferd, Pferde"</i> by sliding the page slider (up to 100 first retrieval results can be inspected).
|
95 |
+
To visualise details about the retrieved text chunk, copy and paste the document name (e.g. <i>Z166069305_430</i>) in the search bar below and click on the <i>Inspect</i> button. Please note that pressing <i>Enter</i> does not work.</p>
|
96 |
+
</div>""")
|
97 |
|
98 |
data_source_dropdown = gr.Dropdown(choices=list(data_sources.keys()), label="Select Data Source", value="Results Cleaned OCR")
|
99 |
page_slider = gr.Slider(1, 1, step=1, label="Page", interactive=True)
|
|
|
137 |
# Further information block at the end
|
138 |
gr.Markdown("""
|
139 |
## π Further Information
|
140 |
+
<div style="font-size: 16px;">
|
141 |
<p>This demo lets you explore our preliminary results for retrieving <i>nature</i> representations in imperfect OCR data extracted from 17-19 century German texts.
|
142 |
This research was done in the <a href="https://onit.oeaw.ac.at/">Ottoman Nature in Travelogues (ONiT)</a> project and funded by the Austrian Science Fund (FWF: P 35245).
|
143 |
The text retrieval was done with hybrid vector/lexical search (BM25) by using a <a href="https://docs.marqo.ai/">Marqo</a>
|