davanstrien HF staff committed on
Commit
eb9f45f
·
1 Parent(s): fc80ecb

add search by text

Browse files
Files changed (1) hide show
  1. app.py +56 -6
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import asyncio
 
2
  import re
3
  from typing import Dict, List
4
 
5
  import gradio as gr
6
  import httpx
7
- from huggingface_hub import ModelCard
8
  from cashews import cache
9
-
10
 
11
  cache.setup("mem://")
12
  API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
@@ -24,6 +24,34 @@ async def fetch_similar_datasets(dataset_id: str, limit: int = 10) -> List[Dict]
24
  return []
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  async def fetch_dataset_card(dataset_id: str) -> str:
28
  url = README_URL_TEMPLATE.format(dataset_id)
29
  async with httpx.AsyncClient() as client:
@@ -128,14 +156,24 @@ with gr.Blocks() as demo:
128
  gr.Markdown("## 🤗 Dataset Similarity Search")
129
  with gr.Row():
130
  gr.Markdown(
131
- "This Gradio app allows you to find similar datasets based on a given dataset ID. "
132
- "Enter a dataset ID (e.g., 'airtrain-ai/fineweb-edu-fortified') to find similar datasets with previews of their dataset cards."
133
  )
 
 
 
 
 
 
134
  with gr.Row():
135
  dataset_id = gr.Textbox(
136
  value="airtrain-ai/fineweb-edu-fortified",
137
  label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
138
  )
 
 
 
 
139
 
140
  with gr.Row():
141
  search_btn = gr.Button("Search Similar Datasets")
@@ -148,11 +186,23 @@ with gr.Blocks() as demo:
148
  )
149
 
150
  results = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
151
  search_btn.click(
152
- lambda dataset_id, limit: asyncio.run(
153
  search_similar_datasets(dataset_id, limit)
 
 
154
  ),
155
- inputs=[dataset_id, max_results],
156
  outputs=results,
157
  )
158
 
 
1
  import asyncio
2
+ import json
3
  import re
4
  from typing import Dict, List
5
 
6
  import gradio as gr
7
  import httpx
 
8
  from cashews import cache
9
+ from huggingface_hub import ModelCard
10
 
11
  cache.setup("mem://")
12
  API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
 
24
  return []
25
 
26
 
27
async def fetch_similar_datasets_by_text(query: str, limit: int = 10) -> List[Dict]:
    """Fetch datasets matching a free-text query from the similarity API.

    Sends a POST request to ``f"{API_URL}_by_text"`` with ``query`` and ``n``
    as URL parameters and returns the ``"results"`` list of the JSON reply.

    Args:
        query: Free-text description of the datasets to look for.
        limit: Maximum number of results to return.

    Returns:
        At most ``limit`` result dicts. An empty list is returned when the
        query is blank, ``limit`` is not positive, the request fails at the
        transport level, the status code is not 200, or the reply does not
        carry a ``"results"`` list.
    """
    # Treat a blank query or a non-positive limit as "no results" without
    # making a network round trip.
    if not query or not query.strip() or limit <= 0:
        return []

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                # NOTE(review): one more than `limit` is requested and the
                # list is trimmed below; the reason is not evident from this
                # function alone, so the request shape is kept as-is.
                f"{API_URL}_by_text",
                params={"query": query, "n": limit + 1},
            )
    except httpx.HTTPError:
        # Same failure value as a non-200 reply: callers only check for an
        # empty list.
        return []

    if response.status_code != 200:
        return []

    try:
        payload = response.json()
    except ValueError:
        # Body was not valid JSON.
        return []

    results = payload.get("results") if isinstance(payload, dict) else None
    if not isinstance(results, list):
        return []
    return results[:limit]
36
+
37
+
38
async def search_similar_datasets_by_text(query: str, limit: int = 10):
    """Search datasets by free text and render the matches.

    Looks up matches with ``fetch_similar_datasets_by_text``, then fetches the
    dataset card and dataset info for every match and passes all three lists
    to ``format_results``.

    Args:
        query: Free-text description of the datasets to look for.
        limit: Maximum number of results to include.

    Returns:
        The value produced by ``format_results``, or the string
        ``"No similar datasets found."`` when the lookup returns nothing.
    """
    results = await fetch_similar_datasets_by_text(query, limit)

    if not results:
        return "No similar datasets found."

    dataset_ids = [result["dataset_id"] for result in results]

    # Run the card fetches and the info fetches under a single outer gather so
    # both batches are in flight together rather than one batch after the other.
    dataset_cards, dataset_infos = await asyncio.gather(
        asyncio.gather(*[fetch_dataset_card(ds_id) for ds_id in dataset_ids]),
        asyncio.gather(*[fetch_dataset_info(ds_id) for ds_id in dataset_ids]),
    )

    return format_results(results, dataset_cards, dataset_infos)
53
+
54
+
55
  async def fetch_dataset_card(dataset_id: str) -> str:
56
  url = README_URL_TEMPLATE.format(dataset_id)
57
  async with httpx.AsyncClient() as client:
 
156
  gr.Markdown("## 🤗 Dataset Similarity Search")
157
  with gr.Row():
158
  gr.Markdown(
159
+ "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
160
+ "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards."
161
  )
162
+
163
+ with gr.Row():
164
+ search_type = gr.Radio(
165
+ ["Dataset ID", "Text Query"], label="Search Type", value="Dataset ID"
166
+ )
167
+
168
  with gr.Row():
169
  dataset_id = gr.Textbox(
170
  value="airtrain-ai/fineweb-edu-fortified",
171
  label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
172
  )
173
+ text_query = gr.Textbox(
174
+ label="Text Query (e.g., 'natural language processing dataset')",
175
+ visible=False,
176
+ )
177
 
178
  with gr.Row():
179
  search_btn = gr.Button("Search Similar Datasets")
 
186
  )
187
 
188
  results = gr.Markdown()
189
+
190
def toggle_input_visibility(choice):
    """Return visibility updates for the dataset-ID and text-query inputs.

    The first update is visible only when ``choice`` is ``"Dataset ID"``, the
    second only when it is ``"Text Query"``.
    """
    show_dataset_id = choice == "Dataset ID"
    show_text_query = choice == "Text Query"
    return gr.update(visible=show_dataset_id), gr.update(visible=show_text_query)
194
+
195
+ search_type.change(
196
+ toggle_input_visibility, inputs=[search_type], outputs=[dataset_id, text_query]
197
+ )
198
+
199
  search_btn.click(
200
+ lambda search_type, dataset_id, text_query, limit: asyncio.run(
201
  search_similar_datasets(dataset_id, limit)
202
+ if search_type == "Dataset ID"
203
+ else search_similar_datasets_by_text(text_query, limit)
204
  ),
205
+ inputs=[search_type, dataset_id, text_query, max_results],
206
  outputs=results,
207
  )
208