Spaces:

librarian-bots
/

huggingface-semantic-search

Running

App Files Files Community

davanstrien HF Staff committed on Sep 9, 2024

Commit

eb9f45f

1 Parent(s): fc80ecb

add search by text

Browse files

Files changed (1) hide show

app.py +56 -6

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import asyncio
 import re
 from typing import Dict, List
 import gradio as gr
 import httpx
-from huggingface_hub import ModelCard
 from cashews import cache
 cache.setup("mem://")
 API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
@@ -24,6 +24,34 @@ async def fetch_similar_datasets(dataset_id: str, limit: int = 10) -> List[Dict]
         return []
 async def fetch_dataset_card(dataset_id: str) -> str:
     url = README_URL_TEMPLATE.format(dataset_id)
     async with httpx.AsyncClient() as client:
@@ -128,14 +156,24 @@ with gr.Blocks() as demo:
     gr.Markdown("## &#129303; Dataset Similarity Search")
     with gr.Row():
         gr.Markdown(
-            "This Gradio app allows you to find similar datasets based on a given dataset ID. "
-            "Enter a dataset ID (e.g., 'airtrain-ai/fineweb-edu-fortified') to find similar datasets with previews of their dataset cards."
         )
     with gr.Row():
         dataset_id = gr.Textbox(
             value="airtrain-ai/fineweb-edu-fortified",
             label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
         )
     with gr.Row():
         search_btn = gr.Button("Search Similar Datasets")
@@ -148,11 +186,23 @@ with gr.Blocks() as demo:
         )
     results = gr.Markdown()
     search_btn.click(
-        lambda dataset_id, limit: asyncio.run(
             search_similar_datasets(dataset_id, limit)
         ),
-        inputs=[dataset_id, max_results],
         outputs=results,
     )

 import asyncio
+import json
 import re
 from typing import Dict, List
 import gradio as gr
 import httpx
 from cashews import cache
+from huggingface_hub import ModelCard
 cache.setup("mem://")
 API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
         return []
+async def fetch_similar_datasets_by_text(query: str, limit: int = 10) -> List[Dict]:
+    async with httpx.AsyncClient() as client:
+        response = await client.post(
+            f"{API_URL}_by_text", params={"query": query, "n": limit + 1}
+        )
+        if response.status_code == 200:
+            results = response.json()["results"]
+            return results[:limit]
+        return []
+async def search_similar_datasets_by_text(query: str, limit: int = 10):
+    results = await fetch_similar_datasets_by_text(query, limit)
+    if not results:
+        return "No similar datasets found."
+    # Fetch dataset cards and info concurrently
+    dataset_cards = await asyncio.gather(
+        *[fetch_dataset_card(result["dataset_id"]) for result in results]
+    )
+    dataset_infos = await asyncio.gather(
+        *[fetch_dataset_info(result["dataset_id"]) for result in results]
+    )
+    return format_results(results, dataset_cards, dataset_infos)
 async def fetch_dataset_card(dataset_id: str) -> str:
     url = README_URL_TEMPLATE.format(dataset_id)
     async with httpx.AsyncClient() as client:
     gr.Markdown("## &#129303; Dataset Similarity Search")
     with gr.Row():
         gr.Markdown(
+            "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
+            "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards."
         )
+    with gr.Row():
+        search_type = gr.Radio(
+            ["Dataset ID", "Text Query"], label="Search Type", value="Dataset ID"
+        )
     with gr.Row():
         dataset_id = gr.Textbox(
             value="airtrain-ai/fineweb-edu-fortified",
             label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
         )
+        text_query = gr.Textbox(
+            label="Text Query (e.g., 'natural language processing dataset')",
+            visible=False,
+        )
     with gr.Row():
         search_btn = gr.Button("Search Similar Datasets")
         )
     results = gr.Markdown()
+    def toggle_input_visibility(choice):
+        return gr.update(visible=choice == "Dataset ID"), gr.update(
+            visible=choice == "Text Query"
+        )
+    search_type.change(
+        toggle_input_visibility, inputs=[search_type], outputs=[dataset_id, text_query]
+    )
     search_btn.click(
+        lambda search_type, dataset_id, text_query, limit: asyncio.run(
             search_similar_datasets(dataset_id, limit)
+            if search_type == "Dataset ID"
+            else search_similar_datasets_by_text(text_query, limit)
         ),
+        inputs=[search_type, dataset_id, text_query, max_results],
         outputs=results,
     )