rahmanidashti committed
Commit a77a138 · 1 Parent(s): 730a691

upgrade leaderboard

.gitignore CHANGED
@@ -11,3 +11,5 @@ eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
 
 
 
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
14
+
15
+ *.DS_Store
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Llm As A Rel
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
- license: apache-2.0
10
- short_description: LLMs as Automatic Relevance Judgment
 
11
  ---
12
 
13
  # Start the configuration
 
1
  ---
2
+ title: Stark Leaderboard
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
+ license: mit
10
+ short_description: leaderboard of Semi-structured Retrieval Benchmark (STaRK)
11
+ hf_oauth: true
12
  ---
13
 
14
  # Start the configuration
app.py CHANGED
@@ -1,204 +1,972 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
  try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
  )
90
91
 
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
190
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import numpy as np
4
+ import os
5
+ import re
6
+ from datetime import datetime
7
+ import json
8
+ import torch
9
+ from tqdm import tqdm
10
+ from concurrent.futures import ProcessPoolExecutor, as_completed
11
+ import smtplib
12
+ from email.mime.multipart import MIMEMultipart
13
+ from email.mime.text import MIMEText
14
+ from huggingface_hub import HfApi
15
+ import shutil
16
+ import tempfile
17
+
18
+ from stark_qa import load_qa
19
+ from stark_qa.evaluator import Evaluator
20
+
21
+ from utils.hub_storage import HubStorage
22
+ from utils.token_handler import TokenHandler
23
+
24
+ # Initialize storage once at startup
25
  try:
26
+ REPO_ID = "snap-stanford/stark-leaderboard" # Replace with your space name
27
+ hub_storage = HubStorage(REPO_ID)
28
+ except Exception as e:
29
+ raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
30
+
31
+
32
+ def process_single_instance(args):
33
+ idx, eval_csv, qa_dataset, evaluator, eval_metrics = args
34
+ query, query_id, answer_ids, meta_info = qa_dataset[idx]
35
+
36
+ try:
37
+ pred_rank = eval_csv[eval_csv['query_id'] == query_id]['pred_rank'].item()
38
+ except (IndexError, ValueError):
39
+ raise IndexError(f'Error when processing query_id={query_id}, please make sure the predicted results exist for this query.')
40
+ except Exception as e:
41
+ raise RuntimeError(f'Unexpected error occurred while fetching prediction rank for query_id={query_id}: {e}')
42
+
43
+ if isinstance(pred_rank, str):
44
+ try:
45
+ pred_rank = eval(pred_rank)
46
+ except SyntaxError as e:
47
+ raise ValueError(f'Failed to parse pred_rank as a list for query_id={query_id}: {e}')
48
+
49
+ if not isinstance(pred_rank, list):
50
+ raise TypeError(f'Error when processing query_id={query_id}, expected pred_rank to be a list but got {type(pred_rank)}.')
51
+
52
+ pred_dict = {pred_rank[i]: -i for i in range(min(100, len(pred_rank)))}
53
+ answer_ids = torch.LongTensor(answer_ids)
54
+ result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
55
+
56
+ result["idx"], result["query_id"] = idx, query_id
57
+ return result
58
+
59
+
60
+ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
61
+ candidate_ids_dict = {
62
+ 'amazon': [i for i in range(957192)],
63
+ 'mag': [i for i in range(1172724, 1872968)],
64
+ 'prime': [i for i in range(129375)]
65
+ }
66
+ try:
67
+ eval_csv = pd.read_csv(csv_path)
68
+ if 'query_id' not in eval_csv.columns:
69
+ raise ValueError('No `query_id` column found in the submitted csv.')
70
+ if 'pred_rank' not in eval_csv.columns:
71
+ raise ValueError('No `pred_rank` column found in the submitted csv.')
72
+
73
+ eval_csv = eval_csv[['query_id', 'pred_rank']]
74
+
75
+ if dataset not in candidate_ids_dict:
76
+ raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
77
+ if split not in ['test', 'test-0.1', 'human_generated_eval']:
78
+ raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
79
+
80
+ evaluator = Evaluator(candidate_ids_dict[dataset])
81
+ eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
82
+ qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
83
+ split_idx = qa_dataset.get_idx_split()
84
+ all_indices = split_idx[split].tolist()
85
+
86
+ results_list = []
87
+ query_ids = []
88
+
89
+ # Prepare args for each worker
90
+ args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
91
+
92
+ with ProcessPoolExecutor(max_workers=num_workers) as executor:
93
+ futures = [executor.submit(process_single_instance, arg) for arg in args]
94
+ for future in tqdm(as_completed(futures), total=len(futures)):
95
+ result = future.result() # This will raise an error if the worker encountered one
96
+ results_list.append(result)
97
+ query_ids.append(result['query_id'])
98
+
99
+ # Concatenate results and compute final metrics
100
+ eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
101
+ final_results = {
102
+ metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
103
+ }
104
+ return final_results
105
+
106
+ except pd.errors.EmptyDataError:
107
+ return "Error: The CSV file is empty or could not be read. Please check the file and try again."
108
+ except FileNotFoundError:
109
+ return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
110
+ except Exception as error:
111
+ return f"{error}"
112
+
113
+
114
+ # Data dictionaries for leaderboard
115
+ data_synthesized_full = {
116
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
117
+ 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
118
+ 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
119
+ 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
120
+ 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
121
+ 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
122
+ 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
123
+ 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
124
+ 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
125
+ 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
126
+ 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
127
+ 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
128
+ 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
129
+ }
130
+
131
+ data_synthesized_10 = {
132
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
133
+ 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
134
+ 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
135
+ 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
136
+ 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
137
+ 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
138
+ 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
139
+ 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
140
+ 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
141
+ 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
142
+ 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
143
+ 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
144
+ 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
145
+ }
146
+
147
+ data_human_generated = {
148
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
149
+ 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
150
+ 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
151
+ 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
152
+ 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
153
+ 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
154
+ 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
155
+ 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
156
+ 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
157
+ 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
158
+ 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
159
+ 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
160
+ 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
161
+ }
162
+
163
+ # Initialize DataFrames
164
+ df_synthesized_full = pd.DataFrame(data_synthesized_full)
165
+ df_synthesized_10 = pd.DataFrame(data_synthesized_10)
166
+ df_human_generated = pd.DataFrame(data_human_generated)
167
+
168
+ # Model type definitions
169
+ model_types = {
170
+ 'Sparse Retriever': ['BM25'],
171
+ 'Small Dense Retrievers': ['DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)'],
172
+ 'LLM-based Dense Retrievers': ['ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b'],
173
+ 'Multivector Retrievers': ['multi-ada-002', 'ColBERTv2'],
174
+ 'LLM Rerankers': ['Claude3 Reranker', 'GPT4 Reranker'],
175
+ 'Others': [] # Will be populated dynamically with submitted models
176
+ }
177
+
178
+ # Submission form validation functions
179
+ def validate_email(email_str):
180
+ """Validate email format(s)"""
181
+ emails = [e.strip() for e in email_str.split(';')]
182
+ email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
183
+ return all(email_pattern.match(email) for email in emails)
184
+
185
+ def validate_github_url(url):
186
+ """Validate GitHub URL format"""
187
+ github_pattern = re.compile(
188
+ r'^https?:\/\/(?:www\.)?github\.com\/[\w-]+\/[\w.-]+\/?$'
189
  )
190
+ return bool(github_pattern.match(url))
191
+
192
+ def validate_csv(file_obj):
193
+ """Validate CSV file format and content"""
194
+ try:
195
+ df = pd.read_csv(file_obj.name)
196
+ required_cols = ['query_id', 'pred_rank']
197
+
198
+ if not all(col in df.columns for col in required_cols):
199
+ return False, "CSV must contain 'query_id' and 'pred_rank' columns"
200
+
201
+ try:
202
+ first_rank = eval(df['pred_rank'].iloc[0]) if isinstance(df['pred_rank'].iloc[0], str) else df['pred_rank'].iloc[0]
203
+ if not isinstance(first_rank, list) or len(first_rank) < 20:
204
+ return False, "pred_rank must be a list with at least 20 candidates"
205
+ except:
206
+ return False, "Invalid pred_rank format"
207
+
208
+ return True, "Valid CSV file"
209
+ except Exception as e:
210
+ return False, f"Error processing CSV: {str(e)}"
211
 
212
+ def sanitize_name(name):
213
+ """Sanitize name for file system use"""
214
+ return re.sub(r'[^a-zA-Z0-9]', '_', name)
215
 
216
+ def read_json_from_hub(api: HfApi, repo_id: str, file_path: str) -> dict:
217
+ """
218
+ Read and parse JSON file from HuggingFace Hub.
219
+
220
+ Args:
221
+ api: HuggingFace API instance
222
+ repo_id: Repository ID
223
+ file_path: Path to file in repository
224
+
225
+ Returns:
226
+ dict: Parsed JSON content
227
+ """
228
+ try:
229
+ # Download the file content as bytes
230
+ content = api.hf_hub_download(
231
+ repo_id=repo_id,
232
+ filename=file_path,
233
+ repo_type="space"
234
+ )
235
+
236
+ # Read and parse JSON
237
+ with open(content, 'r') as f:
238
+ return json.load(f)
239
+ except Exception as e:
240
+ print(f"Error reading JSON file {file_path}: {str(e)}")
241
+ return None
242
+
243
+ def scan_submissions_directory():
244
+ """
245
+ Scans the submissions directory and updates the model types dictionary
246
+ with submitted models.
247
+ """
248
+ try:
249
+ # Initialize HuggingFace API
250
+ api = HfApi()
251
+
252
+ # Track submissions for each split
253
+ submissions_by_split = {
254
+ 'test': [],
255
+ 'test-0.1': [],
256
+ 'human_generated_eval': []
257
+ }
258
+
259
+ # Get all files from repository
260
+ try:
261
+ all_files = api.list_repo_files(
262
+ repo_id=REPO_ID,
263
+ repo_type="space"
264
  )
265
+ # Filter for files in submissions directory
266
+ repo_files = [f for f in all_files if f.startswith('submissions/')]
267
+ except Exception as e:
268
+ print(f"Error listing repository contents: {str(e)}")
269
+ return submissions_by_split
270
+
271
+ # Group files by team folders
272
+ folder_files = {}
273
+ for filepath in repo_files:
274
+ parts = filepath.split('/')
275
+ if len(parts) < 3: # Need at least submissions/team_folder/file
276
+ continue
277
+
278
+ folder_name = parts[1] # team_folder name
279
+ if folder_name not in folder_files:
280
+ folder_files[folder_name] = []
281
+ folder_files[folder_name].append(filepath)
282
+
283
+ # Process each team folder
284
+ for folder_name, files in folder_files.items():
285
+ try:
286
+ # Find latest.json in this folder
287
+ latest_file = next((f for f in files if f.endswith('latest.json')), None)
288
+ if not latest_file:
289
+ print(f"No latest.json found in {folder_name}")
290
+ continue
291
+
292
+ # Read latest.json
293
+ latest_info = read_json_from_hub(api, REPO_ID, latest_file)
294
+ if not latest_info:
295
+ print(f"Failed to read latest.json for {folder_name}")
296
+ continue
297
+
298
+ timestamp = latest_info.get('latest_submission')
299
+ if not timestamp:
300
+ print(f"No timestamp found in latest.json for {folder_name}")
301
+ continue
302
+
303
+ # Find metadata file for latest submission
304
+ metadata_file = next(
305
+ (f for f in files if f.endswith(f'metadata_{timestamp}.json')),
306
+ None
307
+ )
308
+ if not metadata_file:
309
+ print(f"No matching metadata file found for {folder_name} timestamp {timestamp}")
310
+ continue
311
+
312
+ # Read metadata file
313
+ submission_data = read_json_from_hub(api, REPO_ID, metadata_file)
314
+ if not submission_data:
315
+ print(f"Failed to read metadata for {folder_name}")
316
+ continue
317
+
318
+ if latest_info.get('status') != 'approved':
319
+ print(f"Skipping unapproved submission in {folder_name}")
320
+ continue
321
+
322
+ # Add to submissions by split
323
+ split = submission_data.get('Split')
324
+ if split in submissions_by_split:
325
+ submissions_by_split[split].append(submission_data)
326
+
327
+ # Update model types if necessary
328
+ method_name = submission_data.get('Method Name')
329
+ model_type = submission_data.get('Model Type', 'Others')
330
+
331
+ # Add to model type if it's a new method
332
+ method_exists = any(method_name in methods for methods in model_types.values())
333
+ if not method_exists and model_type in model_types:
334
+ model_types[model_type].append(method_name)
335
+
336
+ except Exception as e:
337
+ print(f"Error processing folder {folder_name}: {str(e)}")
338
+ continue
339
+
340
+ return submissions_by_split
341
+
342
+ except Exception as e:
343
+ print(f"Error scanning submissions directory: {str(e)}")
344
+ return None
345
+
346
+ def initialize_leaderboard():
347
+ """
348
+ Initialize the leaderboard with baseline results and submitted results.
349
+ """
350
+ global df_synthesized_full, df_synthesized_10, df_human_generated
351
+
352
+ try:
353
+ # First, initialize with baseline results
354
+ df_synthesized_full = pd.DataFrame(data_synthesized_full)
355
+ df_synthesized_10 = pd.DataFrame(data_synthesized_10)
356
+ df_human_generated = pd.DataFrame(data_human_generated)
357
+
358
+ print("Initialized with baseline results")
359
+
360
+ # Then scan and add submitted results
361
+ submissions = scan_submissions_directory()
362
+ if submissions:
363
+ for split, split_submissions in submissions.items():
364
+ for submission in split_submissions:
365
+ if submission.get('results'): # Make sure we have results
366
+ # Update appropriate DataFrame based on split
367
+ if split == 'test':
368
+ df_to_update = df_synthesized_full
369
+ elif split == 'test-0.1':
370
+ df_to_update = df_synthesized_10
371
+ else: # human_generated_eval
372
+ df_to_update = df_human_generated
373
+
374
+ # Prepare new row data
375
+ new_row = {
376
+ 'Method': submission['Method Name'],
377
+ f'STARK-{submission["Dataset"].upper()}_Hit@1': submission['results']['hit@1'],
378
+ f'STARK-{submission["Dataset"].upper()}_Hit@5': submission['results']['hit@5'],
379
+ f'STARK-{submission["Dataset"].upper()}_R@20': submission['results']['recall@20'],
380
+ f'STARK-{submission["Dataset"].upper()}_MRR': submission['results']['mrr']
381
+ }
382
+
383
+ # Update existing row or add new one
384
+ method_mask = df_to_update['Method'] == submission['Method Name']
385
+ if method_mask.any():
386
+ for col in new_row:
387
+ df_to_update.loc[method_mask, col] = new_row[col]
388
+ else:
389
+ df_to_update.loc[len(df_to_update)] = new_row
390
+
391
+ print("Leaderboard initialization complete")
392
+
393
+ except Exception as e:
394
+ print(f"Error initializing leaderboard: {str(e)}")
395
+
396
+ def get_file_content(file_path):
397
+ """
398
+ Helper function to safely read file content from HuggingFace repository
399
+ """
400
+ try:
401
+ api = HfApi()
402
+ content_path = api.hf_hub_download(
403
+ repo_id=REPO_ID,
404
+ filename=file_path,
405
+ repo_type="space"
406
+ )
407
+ with open(content_path, 'r') as f:
408
+ return f.read()
409
+ except Exception as e:
410
+ print(f"Error reading file {file_path}: {str(e)}")
411
+ return None
412
+
413
+ def save_submission(submission_data, csv_file):
414
+ """
415
+ Save submission data and CSV file using model_name_team_name format
416
+
417
+ Args:
418
+ submission_data (dict): Metadata and results for the submission
419
+ csv_file: The uploaded CSV file object
420
+ """
421
+ # Create folder name from model name and team name
422
+ model_name_clean = sanitize_name(submission_data['Method Name'])
423
+ team_name_clean = sanitize_name(submission_data['Team Name'])
424
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
425
+
426
+ # Create folder name: model_name_team_name
427
+ folder_name = f"{model_name_clean}_{team_name_clean}"
428
+ submission_id = f"{folder_name}_{timestamp}"
429
+
430
+ # Create submission directory structure
431
+ base_dir = "submissions"
432
+ submission_dir = os.path.join(base_dir, folder_name)
433
+ os.makedirs(submission_dir, exist_ok=True)
434
+
435
+ # Save CSV file with timestamp to allow multiple submissions
436
+ csv_filename = f"predictions_{timestamp}.csv"
437
+ csv_path = os.path.join(submission_dir, csv_filename)
438
+ if hasattr(csv_file, 'name'):
439
+ with open(csv_file.name, 'rb') as source, open(csv_path, 'wb') as target:
440
+ target.write(source.read())
441
+
442
+ # Add file paths to submission data
443
+ submission_data.update({
444
+ "csv_path": csv_path,
445
+ "submission_id": submission_id,
446
+ "folder_name": folder_name
447
+ })
448
+
449
+ # Save metadata as JSON with timestamp
450
+ metadata_path = os.path.join(submission_dir, f"metadata_{timestamp}.json")
451
+ with open(metadata_path, 'w') as f:
452
+ json.dump(submission_data, f, indent=4)
453
+
454
+ # Update latest.json to track most recent submission
455
+ latest_path = os.path.join(submission_dir, "latest.json")
456
+ with open(latest_path, 'w') as f:
457
+ json.dump({
458
+ "latest_submission": timestamp,
459
+ "status": "pending_review",
460
+ "method_name": submission_data['Method Name']
461
+ }, f, indent=4)
462
+
463
+ return submission_id
464
+
465
+ def update_leaderboard_data(submission_data):
466
+ """
467
+ Update leaderboard data with new submission results
468
+ Only uses model name in the displayed table
469
+ """
470
+ global df_synthesized_full, df_synthesized_10, df_human_generated
471
+
472
+ # Determine which DataFrame to update based on split
473
+ split_to_df = {
474
+ 'test': df_synthesized_full,
475
+ 'test-0.1': df_synthesized_10,
476
+ 'human_generated_eval': df_human_generated
477
+ }
478
+
479
+ df_to_update = split_to_df[submission_data['Split']]
480
+
481
+ # Prepare new row data
482
+ new_row = {
483
+ 'Method': submission_data['Method Name'], # Only use method name in table
484
+ f'STARK-{submission_data["Dataset"].upper()}_Hit@1': submission_data['results']['hit@1'],
485
+ f'STARK-{submission_data["Dataset"].upper()}_Hit@5': submission_data['results']['hit@5'],
486
+ f'STARK-{submission_data["Dataset"].upper()}_R@20': submission_data['results']['recall@20'],
487
+ f'STARK-{submission_data["Dataset"].upper()}_MRR': submission_data['results']['mrr']
488
+ }
489
+
490
+ # Check if method already exists
491
+ method_mask = df_to_update['Method'] == submission_data['Method Name']
492
+ if method_mask.any():
493
+ # Update existing row
494
+ for col in new_row:
495
+ df_to_update.loc[method_mask, col] = new_row[col]
496
+ else:
497
+ # Add new row
498
+ df_to_update.loc[len(df_to_update)] = new_row
499
+
500
+ # Function to get emails from meta_data
501
+ def get_emails_from_metadata(meta_data):
502
+ """
503
+ Extracts emails from the meta_data dictionary.
504
+
505
+ Args:
506
+ meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field.
507
+
508
+ Returns:
509
+ list: A list of email addresses.
510
+ """
511
+ return [email.strip() for email in meta_data.get("Contact Email(s)", "").split(";")]
512
+
513
+ # Function to format meta_data as an HTML table (without Prediction CSV)
514
+ def format_metadata_as_table(meta_data):
515
+ """
516
+ Formats metadata dictionary into an HTML table for the email.
517
+ Handles multiple contact emails separated by a semicolon.
518
+
519
+ Args:
520
+ meta_data (dict): Dictionary containing submission metadata.
521
+
522
+ Returns:
523
+ str: HTML string representing the metadata table.
524
+ """
525
+ table_rows = ""
526
+
527
+ for key, value in meta_data.items():
528
+ if key == "Contact Email(s)":
529
+ # Ensure that contact emails are split by semicolon
530
+ emails = value.split(';')
531
+ formatted_emails = "; ".join([email.strip() for email in emails])
532
+ table_rows += f"<tr><td><b>{key}</b></td><td>{formatted_emails}</td></tr>"
533
+ elif key != "Prediction CSV": # Exclude the Prediction CSV field
534
+ table_rows += f"<tr><td><b>{key}</b></td><td>{value}</td></tr>"
535
+
536
+ table_html = f"""
537
+ <table border="1" cellpadding="5" cellspacing="0">
538
+ {table_rows}
539
+ </table>
540
+ """
541
+ return table_html
542
 
543
+ # Function to get emails from meta_data
544
+ def get_emails_from_metadata(meta_data):
545
+ """
546
+ Extracts emails from the meta_data dictionary.
547
+
548
+ Args:
549
+ meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field.
550
+
551
+ Returns:
552
+ list: A list of email addresses.
553
+ """
554
+ return [email.strip() for email in meta_data.get("Contact Email(s)", "").split(";")]
555
+
556
+ def format_evaluation_results(results):
557
+ """
558
+ Formats the evaluation results dictionary into a readable string.
559
+
560
+ Args:
561
+ results (dict): Dictionary containing evaluation metrics and their values.
562
+
563
+ Returns:
564
+ str: Formatted string of evaluation results.
565
+ """
566
+ result_lines = [f"{metric}: {value}" for metric, value in results.items()]
567
+ return "\n".join(result_lines)
568
+
569
+ def get_model_type_for_method(method_name):
570
+ """
571
+ Find the model type category for a given method name.
572
+ Returns 'Others' if not found in predefined categories.
573
+ """
574
+ for type_name, methods in model_types.items():
575
+ if method_name in methods:
576
+ return type_name
577
+ return 'Others'
578
+
579
+ def validate_model_type(method_name, selected_type):
580
+ """
581
+ Validate if the selected model type is appropriate for the method name.
582
+ Returns (is_valid, message).
583
+ """
584
+ # Check if method exists in any category
585
+ existing_type = None
586
+ for type_name, methods in model_types.items():
587
+ if method_name in methods:
588
+ existing_type = type_name
589
+ break
590
+
591
+ # If method exists, it must be submitted under its predefined category
592
+ if existing_type:
593
+ if existing_type != selected_type:
594
+ return False, f"This method name is already registered under '{existing_type}'. Please use the correct category."
595
+ return True, "Valid model type"
596
+
597
+ # For new methods, any category is valid
598
+ return True, "Valid model type"
599
+
600
+ def process_submission(
601
+ method_name, team_name, dataset, split, contact_email,
602
+ code_repo, csv_file, model_description, hardware, paper_link, model_type
603
+ ):
604
+ """Process and validate submission"""
605
+ temp_files = []
606
+ try:
607
+ # Input validation
608
+ if not all([method_name, team_name, dataset, split, contact_email, code_repo, csv_file, model_type]):
609
+ return "Error: Please fill in all required fields"
610
+
611
+ # Validate model type
612
+ is_valid, message = validate_model_type(method_name, model_type)
613
+ if not is_valid:
614
+ return f"Error: {message}"
615
+
616
+ # Create metadata
617
+ meta_data = {
618
+ "Method Name": method_name,
619
+ "Team Name": team_name,
620
+ "Dataset": dataset,
621
+ "Split": split,
622
+ "Contact Email(s)": contact_email,
623
+ "Code Repository": code_repo,
624
+ "Model Description": model_description,
625
+ "Hardware": hardware,
626
+ "(Optional) Paper link": paper_link,
627
+ "Model Type": model_type
628
+ }
629
+
630
+ # Generate folder name and timestamp
631
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
632
+ folder_name = f"{sanitize_name(method_name)}_{sanitize_name(team_name)}"
633
+
634
+ # Process CSV file
635
+ temp_csv_path = None
636
+ if isinstance(csv_file, str):
637
+ temp_csv_path = csv_file
638
+ else:
639
+ temp_fd, temp_csv_path = tempfile.mkstemp(suffix='.csv')
640
+ temp_files.append(temp_csv_path)
641
+ os.close(temp_fd)
642
+
643
+ if hasattr(csv_file, 'name'):
644
+ shutil.copy2(csv_file.name, temp_csv_path)
645
+ else:
646
+ with open(temp_csv_path, 'wb') as temp_file:
647
+ if hasattr(csv_file, 'seek'):
648
+ csv_file.seek(0)
649
+ if hasattr(csv_file, 'read'):
650
+ shutil.copyfileobj(csv_file, temp_file)
651
+ else:
652
+ temp_file.write(csv_file)
653
+
654
+ if not os.path.exists(temp_csv_path):
655
+ raise FileNotFoundError(f"Failed to create temporary CSV file at {temp_csv_path}")
656
+
657
+ # Compute metrics
658
+ results = compute_metrics(
659
+ csv_path=temp_csv_path,
660
+ dataset=dataset.lower(),
661
+ split=split,
662
+ num_workers=4
663
+ )
664
+
665
+ if isinstance(results, str):
666
+ # send_error_notification(meta_data, results)
667
+ return f"Evaluation error: {results}"
668
+
669
+ # Process results
670
+ processed_results = {
671
+ "hit@1": round(results['hit@1'] * 100, 2),
672
+ "hit@5": round(results['hit@5'] * 100, 2),
673
+ "recall@20": round(results['recall@20'] * 100, 2),
674
+ "mrr": round(results['mrr'] * 100, 2)
675
+ }
676
+
677
+ # Save files to HuggingFace Hub
678
+ try:
679
+ # 1. Save CSV file
680
+ csv_filename = f"predictions_{timestamp}.csv"
681
+ csv_path_in_repo = f"submissions/{folder_name}/{csv_filename}"
682
+ hub_storage.save_to_hub(
683
+ file_content=temp_csv_path,
684
+ path_in_repo=csv_path_in_repo,
685
+ commit_message=f"Add submission: {method_name} by {team_name}"
686
+ )
687
+
688
+ # 2. Save metadata
689
+ submission_data = {
690
+ **meta_data,
691
+ "results": processed_results,
692
+ "status": "approved", # or "pending_review"
693
+ "submission_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
694
+ "csv_path": csv_path_in_repo
695
+ }
696
+
697
+ metadata_fd, temp_metadata_path = tempfile.mkstemp(suffix='.json')
698
+ temp_files.append(temp_metadata_path)
699
+ os.close(metadata_fd)
700
+
701
+ with open(temp_metadata_path, 'w') as f:
702
+ json.dump(submission_data, f, indent=4)
703
+
704
+ metadata_path = f"submissions/{folder_name}/metadata_{timestamp}.json"
705
+ hub_storage.save_to_hub(
706
+ file_content=temp_metadata_path,
707
+ path_in_repo=metadata_path,
708
+ commit_message=f"Add metadata: {method_name} by {team_name}"
709
+ )
710
+
711
+ # 3. Create or update latest.json
712
+ latest_info = {
713
+ "latest_submission": timestamp,
714
+ "status": "approved", # or "pending_review"
715
+ "method_name": method_name,
716
+ "team_name": team_name
717
+ }
718
+
719
+ latest_fd, temp_latest_path = tempfile.mkstemp(suffix='.json')
720
+ temp_files.append(temp_latest_path)
721
+ os.close(latest_fd)
722
+
723
+ with open(temp_latest_path, 'w') as f:
724
+ json.dump(latest_info, f, indent=4)
725
+
726
+ latest_path = f"submissions/{folder_name}/latest.json"
727
+ hub_storage.save_to_hub(
728
+ file_content=temp_latest_path,
729
+ path_in_repo=latest_path,
730
+ commit_message=f"Update latest submission info for {method_name}"
731
+ )
732
+
733
+ except Exception as e:
734
+ raise RuntimeError(f"Failed to save files to HuggingFace Hub: {str(e)}")
735
+
736
+ # Send confirmation email and update leaderboard data
737
+ # send_submission_confirmation(meta_data, processed_results)
738
+ update_leaderboard_data(submission_data)
739
+
740
+ # Return success message
741
+ return f"""
742
+ Submission successful!
743
+
744
+ Evaluation Results:
745
+ Hit@1: {processed_results['hit@1']:.2f}%
746
+ Hit@5: {processed_results['hit@5']:.2f}%
747
+ Recall@20: {processed_results['recall@20']:.2f}%
748
+ MRR: {processed_results['mrr']:.2f}%
749
+
750
+ Your submission has been saved and a confirmation email has been sent to {contact_email}.
751
+ Once approved, your results will appear in the leaderboard under: {method_name}
752
+
753
+ You can find your submission at:
754
+ https://huggingface.co/spaces/{REPO_ID}/tree/main/submissions/{folder_name}
755
+
756
+ Please refresh the page to see your submission in the leaderboard.
757
+ """
758
+
759
+ except Exception as e:
760
+ error_message = f"Error processing submission: {str(e)}"
761
+ # send_error_notification(meta_data, error_message)
762
+ return error_message
763
+ finally:
764
+ # Clean up temporary files
765
+ for temp_file in temp_files:
766
+ try:
767
+ if os.path.exists(temp_file):
768
+ os.unlink(temp_file)
769
+ except Exception as e:
770
+ print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")
771
+
772
+
773
+ def filter_by_model_type(df, selected_types):
774
+ """
775
+ Filter DataFrame by selected model types, including submitted models.
776
+ """
777
+ if not selected_types:
778
+ return df.head(0)
779
+
780
+ # Get all models from selected types
781
+ selected_models = []
782
+ for type_name in selected_types:
783
+ selected_models.extend(model_types[type_name])
784
+
785
+ # Filter DataFrame to include only selected models
786
+ return df[df['Method'].isin(selected_models)]
787
+
788
+ def format_dataframe(df, dataset):
789
+ columns = ['Method'] + [col for col in df.columns if dataset in col]
790
+ filtered_df = df[columns].copy()
791
+ filtered_df.columns = [col.split('_')[-1] if '_' in col else col for col in filtered_df.columns]
792
+ filtered_df = filtered_df.sort_values('MRR', ascending=False)
793
+ return filtered_df
794
+
795
+ def update_tables(selected_types):
796
+ """
797
+ Update tables based on selected model types.
798
+ Include all models from selected categories.
799
+ """
800
+ if not selected_types:
801
+ return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
802
+
803
+ filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
804
+ filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
805
+ filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
806
+
807
+ outputs = []
808
+ for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
809
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
810
+ outputs.append(format_dataframe(df, f"STARK-{dataset}"))
811
+
812
+ return outputs
813
+
814
+ css = """
815
+ table > thead {
816
+ white-space: normal
817
+ }
818
+
819
+ table {
820
+ --cell-width-1: 250px
821
+ }
822
+
823
+ table > tbody > tr > td:nth-child(2) > div {
824
+ overflow-x: auto
825
+ }
826
+
827
+ .tab-nav {
828
+ border-bottom: 1px solid rgba(255, 255, 255, 0.1);
829
+ margin-bottom: 1rem;
830
+ }
831
+ """
832
+
833
+ # Main application
834
+ with gr.Blocks(css=css) as demo:
835
+ gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
836
+ gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")
837
+
838
+ # Initialize leaderboard at startup
839
+ print("Starting leaderboard initialization...")
840
+ initialize_leaderboard()
841
+ print("Leaderboard initialization finished")
842
+
843
+ # Model type filter
844
+ model_type_filter = gr.CheckboxGroup(
845
+ choices=list(model_types.keys()),
846
+ value=list(model_types.keys()),
847
+ label="Model types",
848
+ interactive=True
849
+ )
850
+
851
+ # Initialize dataframes list
852
+ all_dfs = []
853
+
854
+ # Create nested tabs structure
855
+ with gr.Tabs() as outer_tabs:
856
+ with gr.TabItem("Synthesized (full)"):
857
+ with gr.Tabs() as inner_tabs1:
858
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
859
+ with gr.TabItem(dataset):
860
+ all_dfs.append(gr.DataFrame(interactive=False))
861
+
862
+ with gr.TabItem("Synthesized (10%)"):
863
+ with gr.Tabs() as inner_tabs2:
864
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
865
+ with gr.TabItem(dataset):
866
+ all_dfs.append(gr.DataFrame(interactive=False))
867
+
868
+ with gr.TabItem("Human-Generated"):
869
+ with gr.Tabs() as inner_tabs3:
870
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
871
+ with gr.TabItem(dataset):
872
+ all_dfs.append(gr.DataFrame(interactive=False))
873
+
874
+ # Submission section
875
+ gr.Markdown("---")
876
+ gr.Markdown("## Submit Your Results")
877
+ gr.Markdown("""
878
+ Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
879
+ For questions, contact [email protected]. Detailed instructions can be found in the [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
880
+ """)
881
+
882
  with gr.Row():
883
+ with gr.Column():
884
+ method_name = gr.Textbox(
885
+ label="Method Name (max 25 chars)*",
886
+ placeholder="e.g., MyRetrievalModel-v1"
887
+ )
888
+ dataset = gr.Dropdown(
889
+ choices=["amazon", "mag", "prime"],
890
+ label="Dataset*",
891
+ value="amazon"
892
+ )
893
+ split = gr.Dropdown(
894
+ choices=["test", "test-0.1", "human_generated_eval"],
895
+ label="Split*",
896
+ value="test"
897
+ )
898
+ team_name = gr.Textbox(
899
+ label="Team Name (max 25 chars)*",
900
+ placeholder="e.g., Stanford NLP"
901
+ )
902
+ contact_email = gr.Textbox(
903
+ label="Contact Email(s)*",
904
905
+ )
906
+ model_type = gr.Dropdown(
907
+ choices=list(model_types.keys()),
908
+ label="Model Type*",
909
+ value="Others",
910
+ info="Select the appropriate category for your model"
911
  )
912
+
913
+
914
+ with gr.Column():
915
+ model_description = gr.Textbox(
916
+ label="Model Description*",
917
+ lines=3,
918
+ placeholder="Briefly describe how your retriever model works..."
919
+ )
920
+ code_repo = gr.Textbox(
921
+ label="Code Repository*",
922
+ placeholder="https://github.com/snap-stanford/stark-leaderboard"
923
+ )
924
+ hardware = gr.Textbox(
925
+ label="Hardware Specifications*",
926
+ placeholder="e.g., 4x NVIDIA A100 80GB"
927
+ )
928
+ csv_file = gr.File(
929
+ label="Prediction CSV*",
930
+ file_types=[".csv"],
931
+ type="filepath"
932
+ )
933
+ paper_link = gr.Textbox(
934
+ label="Paper Link (Optional)",
935
+ placeholder="https://arxiv.org/abs/..."
936
+ )
937
+
938
+ submit_btn = gr.Button("Submit", variant="primary")
939
+ result = gr.Textbox(label="Submission Status", interactive=False)
940
+
941
+
942
+ # Set up event handlers
943
+ model_type_filter.change(
944
+ update_tables,
945
+ inputs=[model_type_filter],
946
+ outputs=all_dfs
947
+ )
948
+
949
+ # Event handler for submission button
950
+ submit_btn.click(
951
+ fn=process_submission,
952
+ inputs=[
953
+ method_name, team_name, dataset, split, contact_email,
954
+ code_repo, csv_file, model_description, hardware, paper_link, model_type
955
+ ],
956
+ outputs=result
957
+ ).success( # Add a success handler to update tables after successful submission
958
+ fn=update_tables,
959
+ inputs=[model_type_filter],
960
+ outputs=all_dfs
961
+ )
962
+
963
+ # Initial table update
964
+ demo.load(
965
+ update_tables,
966
+ inputs=[model_type_filter],
967
+ outputs=all_dfs
968
+ )
969
+
970
 
971
+ # Launch the application
972
+ demo.launch()
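
For reference, here is a minimal sketch (not part of this commit) of a prediction file in the shape that `validate_csv()` and `compute_metrics()` above expect; the query IDs and rankings below are placeholders:

```python
# Hypothetical example: build a predictions CSV with the two required columns.
# Each pred_rank entry is a list of candidate node IDs ordered best-first;
# validate_csv() requires at least 20 candidates, and only the top 100 are scored.
import pandas as pd

rows = [
    {"query_id": 0, "pred_rank": list(range(100))},       # placeholder ranking
    {"query_id": 1, "pred_rank": list(range(100, 200))},  # placeholder ranking
]
pd.DataFrame(rows).to_csv("predictions.csv", index=False)

# When read back, pred_rank arrives as a string and is parsed with eval().
# The same file can then be scored locally with, e.g.:
# compute_metrics("predictions.csv", dataset="amazon", split="test")
```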
 
 
requirements.txt CHANGED
@@ -10,7 +10,10 @@ matplotlib
10
  numpy
11
  pandas
12
  python-dateutil
 
13
  tqdm
14
  transformers
 
15
  tokenizers>=0.15.0
16
- sentencepiece
 
 
10
  numpy
11
  pandas
12
  python-dateutil
13
+ python-dotenv
14
  tqdm
15
  transformers
16
+ torch
17
  tokenizers>=0.15.0
18
+ sentencepiece
19
+ stark_qa
src/about.py CHANGED
@@ -21,11 +21,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">LLMJudge Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- This is a leaderboard for LLMJudge challenge
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
submissions/debug_submission_none/latest.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "latest_submission": "20241024_125801",
3
+ "status": "approved",
4
+ "method_name": "debug-submission",
5
+ "team_name": "none"
6
+ }
submissions/debug_submission_none/metadata_20241024_125801.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "Method Name": "debug-submission",
3
+ "Team Name": "none",
4
+ "Dataset": "mag",
5
+ "Split": "human_generated_eval",
6
+ "Contact Email(s)": "none",
7
+ "Code Repository": "none",
8
+ "Model Description": "none",
9
+ "Hardware": "none",
10
+ "(Optional) Paper link": "none",
11
+ "Model Type": "Others",
12
+ "results": {
13
+ "hit@1": 28.57,
14
+ "hit@5": 41.67,
15
+ "recall@20": 35.95,
16
+ "mrr": 35.94
17
+ },
18
+ "status": "approved",
19
+ "submission_date": "2024-10-24 12:58:41",
20
+ "csv_path": "submissions/debug_submission_none/predictions_20241024_125801.csv"
21
+ }
submissions/debug_submission_none/predictions_20241024_125801.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils/__init__.py ADDED
File without changes
utils/hub_storage.py ADDED
@@ -0,0 +1,41 @@
1
+ from pathlib import Path
2
+ from huggingface_hub import HfApi
3
+ from .token_handler import TokenHandler
4
+
5
+ class HubStorage:
6
+ def __init__(self, repo_id):
7
+ self.repo_id = repo_id
8
+ self.api = HfApi()
9
+
10
+ def get_file_content(self, file_path):
11
+ """
12
+ Get content of a file from the repository
13
+ """
14
+ try:
15
+ local_path = self.api.hf_hub_download(
16
+ repo_id=self.repo_id,
17
+ repo_type="space",
18
+ filename=file_path
19
+ )
20
+ with open(local_path, "r") as f:
21
+ return f.read()
22
+ except Exception as e:
23
+ print(f"Error reading file {file_path}: {str(e)}")
24
+ return None
25
+
26
+ def save_to_hub(self, file_content, path_in_repo, commit_message):
27
+ """
28
+ Save a file to the hub
29
+ """
30
+ try:
31
+ self.api.upload_file(
32
+ path_or_fileobj=file_content,
33
+ path_in_repo=path_in_repo,
34
+ repo_id=self.repo_id,
35
+ repo_type="space",
36
+ commit_message=commit_message
37
+ )
38
+ return True
39
+ except Exception as e:
40
+ print(f"Error saving file to hub: {str(e)}")
41
+ return False
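
As a usage sketch (assuming write access to the Space and an `HF_TOKEN` available to `HfApi`, as set up by `TokenHandler` below), this is roughly how `app.py` drives `HubStorage` when persisting a submission; the file names and repo path here are illustrative:

```python
from utils.hub_storage import HubStorage

# The Space repo used as REPO_ID in app.py.
storage = HubStorage("snap-stanford/stark-leaderboard")

# Upload a local file into the Space's submissions/ tree; save_to_hub() returns
# True on success and False (after printing the error) on failure.
ok = storage.save_to_hub(
    file_content="predictions.csv",  # local path; upload_file also accepts a file object
    path_in_repo="submissions/MyModel_MyTeam/predictions_20240101_000000.csv",  # illustrative
    commit_message="Add submission: MyModel by MyTeam",
)
print("uploaded" if ok else "upload failed")
```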
utils/token_handler.py ADDED
@@ -0,0 +1,75 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from huggingface_hub import HfApi
4
+ from pathlib import Path
5
+
6
+ class TokenHandler:
7
+ def __init__(self):
8
+ # Load environment variables from .env file if it exists
9
+ self.load_environment()
10
+ self.token = self._get_token()
11
+ self.api = HfApi()
12
+
13
+ def load_environment(self):
14
+ """Load environment variables from .env file"""
15
+ env_path = Path('.env')
16
+ if env_path.exists():
17
+ load_dotenv(env_path)
18
+
19
+ def _get_token(self) -> str:
20
+ """Get HuggingFace token from environment variables"""
21
+ token = os.getenv("HF_TOKEN")
22
+ if not token:
23
+ raise EnvironmentError(
24
+ "HF_TOKEN not found in environment variables. "
25
+ "Please set it up using one of these methods:\n"
26
+ "1. Create a .env file with HF_TOKEN=your_token\n"
27
+ "2. Set environment variable HF_TOKEN=your_token\n"
28
+ "3. Add HF_TOKEN to your HuggingFace Space secrets"
29
+ )
30
+ return token
31
+
32
+ def verify_token(self) -> bool:
33
+ """Verify if the token is valid by making a test API call"""
34
+ try:
35
+ # Try to get user information using the token
36
+ self.api.whoami(token=self.token)
37
+ return True
38
+ except Exception as e:
39
+ print(f"Token verification failed: {e}")
40
+ return False
41
+
42
+ def get_verified_token(self) -> str:
43
+ """Get token and verify it's working"""
44
+ if not self.verify_token():
45
+ raise ValueError(
46
+ "Invalid or expired HuggingFace token. "
47
+ "Please check your token at https://huggingface.co/settings/tokens"
48
+ )
49
+ return self.token
50
+
51
+ # Usage example
52
+ def initialize_hf_token():
53
+ """Initialize and verify HuggingFace token"""
54
+ try:
55
+ handler = TokenHandler()
56
+ token = handler.get_verified_token()
57
+ print("✓ HuggingFace token successfully verified")
58
+ return token
59
+ except Exception as e:
60
+ print(f"✗ Error initializing HuggingFace token: {e}")
61
+ return None
62
+
63
+ # Example of how to use in your main code
64
+ if __name__ == "__main__":
65
+ # Create .env file if it doesn't exist
66
+ if not Path('.env').exists():
67
+ print("Creating .env file template...")
68
+ with open('.env', 'w') as f:
69
+ f.write("HF_TOKEN=your_token_here\n")
70
+ print("Please edit .env file and add your HuggingFace token")
71
+
72
+ # Initialize token
73
+ token = initialize_hf_token()
74
+ if token:
75
+ print("Ready to use HuggingFace API")