Spaces:

ai-conferences
/

ICLR2025

Running on Zero

App Files Files Community

hysts HF Staff committed on Jul 16

Commit

c2aa37f

1 Parent(s): 496c8a5

Update

Browse files

Files changed (5) hide show

app.py +22 -49
app_mcp.py +129 -0
search.py +30 -0
semantic_search.py +0 -41
table.py +1 -1

app.py CHANGED Viewed

@@ -4,8 +4,9 @@ import gradio as gr
 import polars as pl
 from gradio_modal import Modal
 from app_pr import demo as demo_pr
-from semantic_search import semantic_search
 from table import df_orig
 DESCRIPTION = "# ICLR 2025"
@@ -59,10 +60,7 @@ df_main = df_orig.select(
 df_main = df_main.with_columns(
     [
-        pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col))
-        .cast(pl.Int64)
-        .fill_null(0)
-        .alias(col)
         for col in ["upvotes", "num_comments"]
     ]
 )
@@ -120,32 +118,25 @@ def update_num_papers(df: pl.DataFrame) -> str:
 def update_df(
-    search_mode: str,
     search_query: str,
     candidate_pool_size: int,
-    score_threshold: float,
     presentation_type: str,
     column_names: list[str],
-    case_insensitive: bool = True,
 ) -> gr.Dataframe:
     df = df_main.clone()
     column_names = ["Title", *column_names]
     if search_query:
-        if search_mode == "Title Search":
-            if case_insensitive:
-                search_query = f"(?i){search_query}"
-            try:
-                df = df.filter(pl.col("Title").str.contains(search_query))
-            except pl.exceptions.ComputeError as e:
-                raise gr.Error(str(e)) from e
         else:
-            paper_ids, scores = semantic_search(search_query, candidate_pool_size, score_threshold)
-            if not paper_ids:
-                df = df.head(0)
-            else:
-                df = pl.DataFrame({"paper_id": paper_ids, "score": scores}).join(df, on="paper_id", how="inner")
-                df = df.sort("score", descending=True).drop("score")
     if presentation_type != "(ALL)":
         df = df.filter(pl.col("Type").str.contains(presentation_type))
@@ -159,10 +150,6 @@ def update_df(
     )
-def update_search_mode(search_mode: str) -> gr.Accordion:
-    return gr.Accordion(visible=search_mode == "Semantic Search")
 def df_row_selected(
     evt: gr.SelectData,
 ) -> tuple[
@@ -186,21 +173,11 @@ with gr.Blocks(css_paths="style.css") as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Accordion(label="Tutorial", open=True):
         gr.Markdown(TUTORIAL)
-    with gr.Group():
-        search_mode = gr.Radio(
-            label="Search Mode",
-            choices=["Semantic Search", "Title Search"],
-            value="Semantic Search",
-            show_label=False,
-            info="Note: Semantic search consumes your ZeroGPU quota.",
-        )
-        search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Enter query here")
-        with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
-            with gr.Row():
-                candidate_pool_size = gr.Slider(
-                    label="Candidate Pool Size", minimum=1, maximum=1000, step=1, value=300
-                )
-                score_threshold = gr.Slider(label="Score Threshold", minimum=0, maximum=1, step=0.01, value=0.5)
     presentation_type = gr.Radio(
         label="Presentation Type",
@@ -231,19 +208,12 @@ with gr.Blocks(css_paths="style.css") as demo:
         title = gr.Textbox(label="Title")
         abstract = gr.Textbox(label="Abstract")
-    search_mode.change(
-        fn=update_search_mode,
-        inputs=search_mode,
-        outputs=advanced_search_options,
-    )
     df.select(fn=df_row_selected, outputs=[abstract_modal, title, abstract])
     inputs = [
-        search_mode,
         search_query,
         candidate_pool_size,
-        score_threshold,
         presentation_type,
         column_names,
     ]
@@ -277,10 +247,13 @@ with gr.Blocks(css_paths="style.css") as demo:
         api_name=False,
     )
 with demo.route("Open PR"):
     demo_pr.render()
 if __name__ == "__main__":
-    demo.queue(api_open=False).launch(show_api=False)

 import polars as pl
 from gradio_modal import Modal
+from app_mcp import demo as demo_mcp
 from app_pr import demo as demo_pr
+from search import search
 from table import df_orig
 DESCRIPTION = "# ICLR 2025"
 df_main = df_main.with_columns(
     [
+        pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).cast(pl.Int64).fill_null(0).alias(col)
         for col in ["upvotes", "num_comments"]
     ]
 )
 def update_df(
     search_query: str,
     candidate_pool_size: int,
+    num_results: int,
     presentation_type: str,
     column_names: list[str],
 ) -> gr.Dataframe:
+    if num_results > candidate_pool_size:
+        raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
     df = df_main.clone()
     column_names = ["Title", *column_names]
     if search_query:
+        results = search(search_query, candidate_pool_size, num_results)
+        if not results:
+            df = df.head(0)
         else:
+            df = pl.DataFrame(results).join(df, on="paper_id", how="inner")
+            df = df.sort("ce_score", descending=True).drop("ce_score")
     if presentation_type != "(ALL)":
         df = df.filter(pl.col("Type").str.contains(presentation_type))
     )
 def df_row_selected(
     evt: gr.SelectData,
 ) -> tuple[
     gr.Markdown(DESCRIPTION)
     with gr.Accordion(label="Tutorial", open=True):
         gr.Markdown(TUTORIAL)
+    search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+    with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
+        with gr.Row():
+            candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+            num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
     presentation_type = gr.Radio(
         label="Presentation Type",
         title = gr.Textbox(label="Title")
         abstract = gr.Textbox(label="Abstract")
     df.select(fn=df_row_selected, outputs=[abstract_modal, title, abstract])
     inputs = [
         search_query,
         candidate_pool_size,
+        num_results,
         presentation_type,
         column_names,
     ]
         api_name=False,
     )
+    with gr.Row(visible=False):
+        demo_mcp.render()
 with demo.route("Open PR"):
     demo_pr.render()
 if __name__ == "__main__":
+    demo.launch(mcp_server=True)

app_mcp.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import gradio as gr
+import polars as pl
+from search import search
+from table import df_orig
+# Every column that can be requested through the MCP tools, in output order.
+COLUMNS_MCP = [
+    "title",
+    "authors",
+    "abstract",
+    "openreview_url",
+    "arxiv_id",
+    "paper_page",
+    "space_ids",
+    "model_ids",
+    "dataset_ids",
+    "upvotes",
+    "num_comments",
+    "project_page",
+    "github",
+    "row_index",
+]
+# Columns that are selectable but not part of the default selection.
+_NON_DEFAULT_COLUMNS_MCP = frozenset(
+    {"paper_page", "space_ids", "model_ids", "dataset_ids", "upvotes", "num_comments"}
+)
+# Derived from COLUMNS_MCP so the default selection keeps the same relative order.
+DEFAULT_COLUMNS_MCP = [name for name in COLUMNS_MCP if name not in _NON_DEFAULT_COLUMNS_MCP]
+# Source-table column names mapped to the names exposed through the MCP tools.
+_RENAMES_MCP = {"openreview": "openreview_url", "paper_id": "row_index"}
+df_mcp = df_orig.rename(_RENAMES_MCP).select(COLUMNS_MCP)
+def search_papers(
+    search_query: str,
+    candidate_pool_size: int,
+    num_results: int,
+    columns: list[str],
+) -> list[dict]:
+    """Searches ICLR 2025 papers relevant to a user query in English.
+    This function performs a semantic search over ICLR 2025 papers.
+    It uses a dual-stage retrieval process:
+    - First, it retrieves `candidate_pool_size` papers using dense vector similarity.
+    - Then, it re-ranks them with a cross-encoder model to select the top `num_results` most relevant papers.
+    - The search results are returned as a list of dictionaries.
+    Note:
+        The search query must be written in English. Queries in other languages are not supported.
+    Args:
+        search_query (str): The natural language query input by the user. Must be in English.
+        candidate_pool_size (int): Number of candidate papers to retrieve using the dense vector model.
+        num_results (int): Final number of top-ranked papers to return after re-ranking.
+        columns (list[str]): The columns to select from the DataFrame.
+    Returns:
+        list[dict]: A list of dictionaries of the top-ranked papers matching the query, sorted by relevance.
+    """
+    # Raises ValueError for an empty query or when num_results exceeds candidate_pool_size.
+    if not search_query:
+        raise ValueError("Search query cannot be empty")
+    if num_results > candidate_pool_size:
+        raise ValueError("Number of results must be less than or equal to candidate pool size")
+    results = search(search_query, candidate_pool_size, num_results)
+    # An empty result list would build a DataFrame without a "paper_id" column, so the
+    # rename below would fail; report "no matches" as an empty list instead.
+    if not results:
+        return []
+    ranked = pl.DataFrame(results).rename({"paper_id": "row_index"})
+    # The join returns a new frame and leaves df_mcp untouched, so no defensive copy is needed.
+    df = ranked.join(df_mcp, on="row_index", how="inner")
+    # Order by cross-encoder score, best first, before projecting onto the requested columns.
+    df = df.sort("ce_score", descending=True)
+    return df.select(columns).to_dicts()
+def get_metadata(row_index: int) -> dict:
+    """Returns a dictionary of metadata for a ICLR 2025 paper at the given table row index.
+    Args:
+        row_index (int): The index of the paper in the internal paper list table.
+    Returns:
+        dict: A dictionary containing metadata for the corresponding paper.
+    """
+    matches = df_mcp.filter(pl.col("row_index") == row_index).to_dicts()
+    # Keep the IndexError type that an unmatched lookup already produced, but say which
+    # row_index was missing instead of a bare "list index out of range".
+    if not matches:
+        raise IndexError(f"No paper found for row_index {row_index}")
+    return matches[0]
+def get_table(columns: list[str]) -> list[dict]:
+    """Returns a list of dictionaries of all ICLR 2025 papers.
+    Args:
+        columns (list[str]): The columns to select from the DataFrame.
+    Returns:
+        list[dict]: A list of dictionaries of all ICLR 2025 papers.
+    """
+    # Project the full table onto the requested columns, then emit one dict per row.
+    selected = df_mcp.select(columns)
+    return selected.to_dicts()
+# UI that exposes search_papers, get_metadata and get_table; each button writes its
+# result into the shared JSON output component.
+with gr.Blocks() as demo:
+    # Input components shared by the three handlers below.
+    search_query = gr.Textbox(label="Search", submit_btn=True)
+    candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=500, step=1, value=200)
+    num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+    column_names = gr.CheckboxGroup(label="Columns", choices=COLUMNS_MCP, value=DEFAULT_COLUMNS_MCP)
+    # NOTE(review): the slider spans 0..len(df_mcp) - 1, while get_metadata matches on the
+    # "row_index" column (renamed from "paper_id") - confirm those ids cover exactly this range.
+    row_index = gr.Slider(label="Row Index", minimum=0, maximum=len(df_mcp) - 1, step=1, value=0)
+    out = gr.JSON()
+    search_papers_btn = gr.Button("Search Papers")
+    get_metadata_btn = gr.Button("Get Metadata")
+    get_table_btn = gr.Button("Get Table")
+    # Wire each button to its handler; all three share the same output.
+    search_papers_btn.click(
+        fn=search_papers,
+        inputs=[search_query, candidate_pool_size, num_results, column_names],
+        outputs=out,
+    )
+    get_metadata_btn.click(
+        fn=get_metadata,
+        inputs=row_index,
+        outputs=out,
+    )
+    get_table_btn.click(
+        fn=get_table,
+        inputs=column_names,
+        outputs=out,
+    )
+# When run as a script, launch this UI on its own with the MCP server enabled.
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)

search.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import datasets
+import numpy as np
+import spaces
+from sentence_transformers import CrossEncoder, SentenceTransformer
+from table import BASE_REPO_ID
+# Load the "train" split of the base dataset once at import time.
+ds = datasets.load_dataset(BASE_REPO_ID, split="train")
+# Build a FAISS index over the "embedding" column; search() queries it for nearest neighbours.
+ds.add_faiss_index(column="embedding")
+# Bi-encoder used by search() to embed the query text.
+bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+# Cross-encoder used by search() to score (query, "title abstract") pairs for re-ranking.
+ce_model = CrossEncoder("BAAI/bge-reranker-base")
+@spaces.GPU(duration=10)
+def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+    """Retrieve candidates by embedding similarity, then re-rank them with the cross-encoder.
+
+    Args:
+        query: Free-text search query.
+        candidate_pool_size: Number of nearest neighbours requested from the FAISS index.
+        retrieval_k: Maximum number of re-ranked results to return.
+
+    Returns:
+        Dicts with "paper_id" and "ce_score" keys, ordered by descending cross-encoder score.
+    """
+    # The query is embedded with this instruction prefix; the cross-encoder sees the raw query.
+    prefix = "Represent this sentence for searching relevant passages: "
+    q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+    _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+    # Pull each column once instead of re-indexing the result mapping for every row.
+    titles = retrieved_ds["title"]
+    abstracts = retrieved_ds["abstract"]
+    paper_ids = retrieved_ds["paper_id"]
+    ce_inputs = [(query, f"{title} {abstract}") for title, abstract in zip(titles, abstracts, strict=True)]
+    # Nothing retrieved means nothing to score; skip the cross-encoder call entirely.
+    if not ce_inputs:
+        return []
+    ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+    # argsort is ascending, so reverse it to put the highest score first.
+    sorted_idx = np.argsort(ce_scores)[::-1]
+    return [{"paper_id": paper_ids[i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]]

semantic_search.py DELETED Viewed

@@ -1,41 +0,0 @@
-import datasets
-import numpy as np
-import scipy.spatial
-import scipy.special
-import spaces
-from sentence_transformers import CrossEncoder, SentenceTransformer
-from table import BASE_REPO_ID
-ds = datasets.load_dataset(BASE_REPO_ID, split="train")
-ds = ds.rename_column("submission_number", "paper_id")
-ds.add_faiss_index(column="embedding")
-model = SentenceTransformer("all-MiniLM-L6-v2")
-reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
-@spaces.GPU(duration=5)
-def semantic_search(
-    query: str, candidate_pool_size: int = 300, score_threshold: float = 0.5
-) -> tuple[list[int], list[float]]:
-    query_vec = model.encode(query)
-    _, retrieved_data = ds.get_nearest_examples("embedding", query_vec, k=candidate_pool_size)
-    rerank_inputs = [
-        [query, f"{title}\n{abstract}"]
-        for title, abstract in zip(retrieved_data["title"], retrieved_data["abstract"], strict=True)
-    ]
-    rerank_scores = reranker.predict(rerank_inputs)
-    sorted_indices = np.argsort(rerank_scores)[::-1]
-    paper_ids = []
-    scores = []
-    for i in sorted_indices:
-        score = float(scipy.special.expit(rerank_scores[i]))
-        if score < score_threshold:
-            break
-        paper_ids.append(retrieved_data["paper_id"][i])
-        scores.append(score)
-    return paper_ids, scores

table.py CHANGED Viewed

@@ -61,7 +61,7 @@ def format_author_claim_ratio(row: dict) -> str:
 df_orig = (
     datasets.load_dataset(BASE_REPO_ID, split="train")
     .to_polars()
-    .rename({"paper_url": "openreview", "submission_number": "paper_id"})
     .with_columns(
         pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
     )

 df_orig = (
     datasets.load_dataset(BASE_REPO_ID, split="train")
     .to_polars()
+    .rename({"paper_url": "openreview"})
     .with_columns(
         pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
     )