Spaces:

librarian-bots
/

huggingface-semantic-search

Running

App Files Files Community

davanstrien HF staff committed on Sep 11, 2024

Commit

dd2978a

1 Parent(s): a93f0c7

add ragatouille search

Browse files

Files changed (2) hide show

app.py +66 -54
ragatouille_search.py +107 -0

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import gradio as gr
 import httpx
 from cashews import cache
 from huggingface_hub import ModelCard
 cache.setup("mem://")
 API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
@@ -150,61 +151,72 @@ async def search_similar_datasets(dataset_id: str, limit: int = 10):
 with gr.Blocks() as demo:
-    gr.Markdown("## &#129303; Dataset Similarity Search")
-    with gr.Row():
-        gr.Markdown(
-            "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
-            "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards.\n\n"
-            "For a seamless experience on the Hugging Face website, check out the "
-            "[Hugging Face Similar Chrome extension](https://chromewebstore.google.com/detail/hugging-face-similar/aijelnjllajooinkcpkpbhckbghghpnl?authuser=0&hl=en). "
-            "This extension adds a 'Similar Datasets' section directly to Hugging Face dataset pages, "
-            "making it even easier to discover related datasets for your projects."
-        )
-    with gr.Row():
-        search_type = gr.Radio(
-            ["Dataset ID", "Text Query"], label="Search Type", value="Dataset ID"
-        )
-    with gr.Row():
-        dataset_id = gr.Textbox(
-            value="airtrain-ai/fineweb-edu-fortified",
-            label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
-        )
-        text_query = gr.Textbox(
-            label="Text Query (e.g., 'natural language processing dataset')",
-            visible=False,
-        )
-    with gr.Row():
-        search_btn = gr.Button("Search Similar Datasets")
-        max_results = gr.Slider(
-            minimum=1,
-            maximum=50,
-            step=1,
-            value=10,
-            label="Maximum number of results",
-        )
-    results = gr.Markdown()
-    def toggle_input_visibility(choice):
-        return gr.update(visible=choice == "Dataset ID"), gr.update(
-            visible=choice == "Text Query"
-        )
-    search_type.change(
-        toggle_input_visibility, inputs=[search_type], outputs=[dataset_id, text_query]
-    )
-    search_btn.click(
-        lambda search_type, dataset_id, text_query, limit: asyncio.run(
-            search_similar_datasets(dataset_id, limit)
-            if search_type == "Dataset ID"
-            else search_similar_datasets_by_text(text_query, limit)
-        ),
-        inputs=[search_type, dataset_id, text_query, max_results],
-        outputs=results,
-    )
 demo.launch()

 import httpx
 from cashews import cache
 from huggingface_hub import ModelCard
+from ragatouille_search import create_ragatouille_interface, search_with_ragatouille
 cache.setup("mem://")
 API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space/similar"
 with gr.Blocks() as demo:
+    gr.Markdown("## &#129303; Dataset Search and Similarity")
+    with gr.Tabs():
+        with gr.TabItem("Similar Datasets"):
+            gr.Markdown("## &#129303; Dataset Similarity Search")
+            with gr.Row():
+                gr.Markdown(
+                    "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
+                    "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards.\n\n"
+                    "For a seamless experience on the Hugging Face website, check out the "
+                    "[Hugging Face Similar Chrome extension](https://chromewebstore.google.com/detail/hugging-face-similar/aijelnjllajooinkcpkpbhckbghghpnl?authuser=0&hl=en). "
+                    "This extension adds a 'Similar Datasets' section directly to Hugging Face dataset pages, "
+                    "making it even easier to discover related datasets for your projects."
+                )
+            with gr.Row():
+                search_type = gr.Radio(
+                    ["Dataset ID", "Text Query"],
+                    label="Search Type",
+                    value="Dataset ID",
+                )
+            with gr.Row():
+                dataset_id = gr.Textbox(
+                    value="airtrain-ai/fineweb-edu-fortified",
+                    label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
+                )
+                text_query = gr.Textbox(
+                    label="Text Query (e.g., 'natural language processing dataset')",
+                    visible=False,
+                )
+            with gr.Row():
+                search_btn = gr.Button("Search Similar Datasets")
+                max_results = gr.Slider(
+                    minimum=1,
+                    maximum=50,
+                    step=1,
+                    value=10,
+                    label="Maximum number of results",
+                )
+            results = gr.Markdown()
+            def toggle_input_visibility(choice):
+                return gr.update(visible=choice == "Dataset ID"), gr.update(
+                    visible=choice == "Text Query"
+                )
+            search_type.change(
+                toggle_input_visibility,
+                inputs=[search_type],
+                outputs=[dataset_id, text_query],
+            )
+            search_btn.click(
+                lambda search_type, dataset_id, text_query, limit: asyncio.run(
+                    search_similar_datasets(dataset_id, limit)
+                    if search_type == "Dataset ID"
+                    else search_similar_datasets_by_text(text_query, limit)
+                ),
+                inputs=[search_type, dataset_id, text_query, max_results],
+                outputs=results,
+            )
+        with gr.TabItem("RAGatouille Search"):
+            ragatouille_interface = create_ragatouille_interface()
 demo.launch()

ragatouille_search.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from pathlib import Path
+import gradio as gr
+from huggingface_hub import snapshot_download
+from ragatouille import RAGPretrainedModel
+from toolz import unique
+from typing import List, Dict, Any
+# Top-level variables
+INDEX_PATH = Path(".ragatouille/colbert/indexes/my_index_with_ids_and_metadata/")  # local dir the index is downloaded into and loaded from
+REPO_ID = "davanstrien/search-index"  # Hub repo (fetched with repo_type="dataset") holding the index files
+INITIAL_QUERY = "hello world"  # throwaway query used once to warm up the index after loading
+DEFAULT_K = 10  # default number of results for searches and for the UI slider
+def initialize_index():
+    INDEX_PATH.mkdir(parents=True, exist_ok=True)
+    snapshot_download(REPO_ID, repo_type="dataset", local_dir=INDEX_PATH)
+    rag = RAGPretrainedModel.from_index(INDEX_PATH)
+    # Warm up index
+    rag.search(INITIAL_QUERY)
+    return rag
+def format_results_as_markdown(results: List[Dict[str, Any]]) -> str:
+    markdown = ""
+    for result in results:
+        content = result["content"]
+        score = result["score"]
+        rank = result["rank"]
+        document_id = result["document_id"]
+        passage_id = result["passage_id"]
+        link = f"https://huggingface.co/datasets/{document_id}"
+        markdown += f"### Result {rank}\n"
+        markdown += f"**Score:** {score}\n\n"
+        markdown += f"**Document ID:** [{document_id}]({link})\n\n"
+        markdown += f"**Passage ID:** {passage_id}\n\n"
+        # Limit initial content display to 1000 characters
+        preview = f"{content[:1000]}..." if len(content) > 1000 else content
+        markdown += f"{preview}\n\n"
+        # Add expandable section for full content if it's longer than 1000 characters
+        if len(content) > 1000:
+            markdown += "<details>\n"
+            markdown += "<summary>Click to expand full content</summary>\n\n"
+            markdown += f"{content}\n\n"
+            markdown += "</details>\n\n"
+        markdown += "---\n\n"
+    return markdown
+def search_with_ragatouille(query, k=DEFAULT_K, make_unique=False):
+    results = RAG.search(query, k=k)
+    if make_unique:
+        results = make_results_unique(results)
+    return format_results_as_markdown(results)
+def make_results_unique(results: List[Dict[str, Any]]):
+    unique_results = unique(results, lambda x: x["document_id"])
+    return list(unique_results)
+def create_ragatouille_interface():
+    with gr.Blocks() as ragatouille_demo:
+        gr.Markdown("### RAGatouille Dataset Search")
+        gr.Markdown(
+            """This interface allows you to search inside dataset cards on the Hub using the [answerai-colbert-small-v1](https://huggingface.co/answerdotai/answerai-colbert-small-v1) ColBERT model via [RAGatouille](https://github.com/AnswerDotAI/RAGatouille). Please be aware that this is an early prototype and may not work as expected!
+            ## Notes:
+            **Not all datasets are indexed yet!**
+            For a dataset to be indexed:
+            - It must have a dataset card on the Hub. You can find documentation on how to write a good dataset card [here](https://huggingface.co/docs/hub/datasets-cards).
+            - The dataset must have at least 1 like and 1 download
+            - The card must be a minimum length (to weed out low quality cards)
+            **At the moment the index is refreshed when I decide to do it, so it may not be up to date.** If there is sufficient interest I will implement a daily refresh (give this repo a like if you'd like this feature!)
+            Feel free to open a discussion to give feedback or request features &#129303;
+            """
+        )
+        query = gr.Textbox(label="Query")
+        k = gr.Slider(1, 100, value=DEFAULT_K, step=1, label="Number of Results")
+        make_unique = gr.Checkbox(False, label="Unique Results")
+        search_button = gr.Button("Search")
+        search_button.click(
+            search_with_ragatouille,
+            inputs=[query, k, make_unique],
+            outputs=gr.Markdown(label="Results"),
+        )
+    return ragatouille_demo
+# Initialize RAG globally: runs at import time (download, load, warm-up) and is the index search_with_ragatouille queries
+RAG = initialize_index()
+def main():
+    demo = create_ragatouille_interface()
+    demo.launch()
+if __name__ == "__main__":
+    main()