Spaces:

librarian-bots
/

base_model_explorer

Running

App Files Files Community

davanstrien HF Staff committed on Sep 6, 2023

Commit

02b09bd

1 Parent(s): 36ed5fd

add filter option

Browse files

Files changed (1) hide show

app.py +66 -17

app.py CHANGED Viewed

@@ -1,9 +1,13 @@
 from huggingface_hub import list_models
-from cachetools import cached, TTLCache
-from toolz import groupby, valmap
 import gradio as gr
 from tqdm.auto import tqdm
 import pandas as pd
 @cached(TTLCache(maxsize=10, ttl=60 * 60 * 3))
@@ -24,12 +28,14 @@ def has_base_model_info(model):
 grouped_by_has_base_model_info = groupby(has_base_model_info, get_all_models())
-print(valmap(len, grouped_by_has_base_model_info))
-summary = f"""{len(grouped_by_has_base_model_info.get(True)):,} models have base model info.
             {len(grouped_by_has_base_model_info.get(False)):,} models don't have base model info.
             Currently {round(len(grouped_by_has_base_model_info.get(True))/len(get_all_models())*100,2)}% of models have base model info."""
 models_with_base_model_info = grouped_by_has_base_model_info.get(True)
 base_models = [
     model.cardData.get("base_model") for model in models_with_base_model_info
@@ -38,11 +44,18 @@ df = pd.DataFrame(
     pd.DataFrame({"base_model": base_models}).value_counts()
 ).reset_index()
 df_with_org = df.copy(deep=True)
 def parse_org(hub_id):
     parts = hub_id.split("/")
-    return parts[0] if len(parts) == 2 else "huggingface"
 df_with_org["org"] = df_with_org["base_model"].apply(parse_org)
@@ -70,6 +83,41 @@ def return_models_for_base_model(base_model):
     return results
 with gr.Blocks() as demo:
     gr.Markdown(
         "# Base model explorer: explore the lineage of models on the  &#129303; Hub"
@@ -78,23 +126,24 @@ with gr.Blocks() as demo:
         """When sharing models to the Hub it is possible to specify a base model in the model card i.e. that your model is a fine-tuned version of [bert-base-cased](https://huggingface.co/bert-base-cased).
         This Space allows you to find children models for a given base model and view the popularity of models for fine-tuning."""
     )
-    gr.Markdown(summary)
-    gr.Markdown("### Find all models trained from a base model")
     base_model = gr.Dropdown(all_base_models, label="Base Model")
     results = gr.Markdown()
     base_model.change(return_models_for_base_model, base_model, results)
     with gr.Accordion("Base model popularity ranking", open=False):
-        gr.DataFrame(df.head(50))
     with gr.Accordion("Base model popularity ranking by organization", open=False):
-        gr.DataFrame(
-            pd.DataFrame(
-                df_with_org.groupby("org")["count"]
-                .sum()
-                .sort_values(ascending=False)
-                .head(50)
-            )
-            .reset_index()
-            .sort_values("count", ascending=False)
         )

 from huggingface_hub import list_models
+from toolz import groupby
 import gradio as gr
 from tqdm.auto import tqdm
 import pandas as pd
+from cachetools import cached, TTLCache
+# from diskcache import Cache
+# cache = Cache("cache")
 @cached(TTLCache(maxsize=10, ttl=60 * 60 * 3))
 grouped_by_has_base_model_info = groupby(has_base_model_info, get_all_models())
def produce_summary():
    """Build the summary text describing base-model metadata coverage.

    Reads the module-level ``grouped_by_has_base_model_info`` mapping (looked
    up under the keys ``True`` and ``False``) and the result of
    ``get_all_models()``.

    Returns:
        A three-line string: the number of models with base model info, the
        number without, and the percentage of all models that have it
        (rounded to two decimals).
    """
    # A key is missing from the grouping when no model falls into that group;
    # count it as zero instead of calling len() on None.
    with_info = len(grouped_by_has_base_model_info.get(True) or [])
    without_info = len(grouped_by_has_base_model_info.get(False) or [])
    total = len(get_all_models())
    # Guard the ratio so an empty model listing reports 0.0% rather than raising.
    percentage = round(with_info / total * 100, 2) if total else 0.0
    return f"""{with_info:,} models have base model info.
            {without_info:,} models don't have base model info.
            Currently {percentage}% of models have base model info."""
 models_with_base_model_info = grouped_by_has_base_model_info.get(True)
 base_models = [
     model.cardData.get("base_model") for model in models_with_base_model_info
     pd.DataFrame({"base_model": base_models}).value_counts()
 ).reset_index()
 df_with_org = df.copy(deep=True)
# Pipeline tag of every model that has base model info, in the same iteration
# order as `models_with_base_model_info`; an entry may be None when the model
# has no tag. The ranking helpers combine this list column-wise with
# `base_models`.
pipeline_tags = [x.pipeline_tag for x in models_with_base_model_info]
# Distinct, non-missing tags offered as filter choices. Sorted so the choice
# order is stable between runs instead of following set iteration order.
unique_pipeline_tags = sorted({tag for tag in pipeline_tags if tag is not None})
 def parse_org(hub_id):
     """Return the organization (namespace) part of a Hub model id.

     Args:
         hub_id: Value taken from a model card's ``base_model`` field,
             expected to look like ``"name"`` or ``"org/name"``.

     Returns:
         ``"huggingface"`` for ids without a slash, the part before the slash
         for ``"org/name"`` ids, and ``None`` when the value cannot be
         attributed to an organization (not a string, empty, a relative or
         absolute filesystem path, or anything with more than one slash).
     """
     if not isinstance(hub_id, str) or not hub_id:
         return None
     parts = hub_id.split("/")
     if len(parts) == 1:
         # Ids without a namespace are attributed to "huggingface".
         return "huggingface"
     org = parts[0]
     # More than one slash means a path or URL rather than "org/name";
     # "", "." and ".." come from "/x", "./x" and "../x" style local paths.
     if len(parts) != 2 or not parts[1] or org in ("", ".", ".."):
         return None
     return org
 df_with_org["org"] = df_with_org["base_model"].apply(parse_org)
     return results
def return_base_model_popularity(pipeline=None):
    """Rank base models by how many models declare them as their base model.

    Builds the ranking from the module-level ``base_models`` and
    ``pipeline_tags`` lists.

    Args:
        pipeline: Optional pipeline tag to restrict the ranking to. ``None``
            (or an empty value) means no filtering.

    Returns:
        A DataFrame with the columns ``base_model`` and ``count``, ordered by
        ``count`` descending and limited to the 50 most common base models.
    """
    models = pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
    if pipeline:
        models = models[models["pipeline"] == pipeline]
    # Count per base model only, and only after filtering. Counting
    # (base_model, pipeline) pairs would list the same base model once per
    # pipeline in the unfiltered ranking, and DataFrame.value_counts() drops
    # rows containing missing values, so models without a pipeline tag would
    # silently vanish from the totals.
    return (
        models["base_model"]
        .value_counts()
        .rename_axis("base_model")
        # Name the column explicitly instead of relying on the pandas version.
        .reset_index(name="count")
        .head(50)
    )
def return_base_model_popularity_by_org(pipeline=None):
    """Rank organizations by how many models build on their base models.

    Builds the ranking from the module-level ``base_models`` and
    ``pipeline_tags`` lists, deriving each organization with ``parse_org``.

    Args:
        pipeline: Optional pipeline tag to restrict the ranking to. ``None``
            (or an empty value) means no filtering.

    Returns:
        A DataFrame with the columns ``org`` and ``count``, ordered by
        ``count`` descending and limited to the top 50 organizations.
    """
    models = pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
    models["org"] = models["base_model"].apply(parse_org)
    # parse_org returns None for ids that do not name an organization; those
    # rows cannot be ranked, so drop them.
    models = models.dropna(subset=["org"])
    if pipeline:
        models = models[models["pipeline"] == pipeline]
    # One row per model, so the group size is the number of models per org.
    # This avoids depending on the "count" column name that value_counts()
    # only produces on pandas >= 2.0.
    return (
        models.groupby("org")
        .size()
        .sort_values(ascending=False)
        .reset_index(name="count")
        .head(50)
    )
 with gr.Blocks() as demo:
     gr.Markdown(
         "# Base model explorer: explore the lineage of models on the  &#129303; Hub"
         """When sharing models to the Hub it is possible to specify a base model in the model card i.e. that your model is a fine-tuned version of [bert-base-cased](https://huggingface.co/bert-base-cased).
         This Space allows you to find children models for a given base model and view the popularity of models for fine-tuning."""
     )
+    gr.Markdown(produce_summary())
+    gr.Markdown("## Find all models trained from a base model")
     base_model = gr.Dropdown(all_base_models, label="Base Model")
     results = gr.Markdown()
     base_model.change(return_models_for_base_model, base_model, results)
+    gr.Markdown("## Base model rankings ")
+    dropdown = gr.Dropdown(
+        choices=unique_pipeline_tags,
+        value=None,
+        label="Filter rankings by task pipeline",
+    )
     with gr.Accordion("Base model popularity ranking", open=False):
+        df_popularity = gr.DataFrame(return_base_model_popularity(None))
+        dropdown.change(return_base_model_popularity, dropdown, df_popularity)
     with gr.Accordion("Base model popularity ranking by organization", open=False):
+        df_popularity_org = gr.DataFrame(return_base_model_popularity_by_org(None))
+        dropdown.change(
+            return_base_model_popularity_by_org, dropdown, df_popularity_org
         )