Commit: 7f66f08
Parent(s): d102592
Refactor code to add createdAt field and filter
app.py CHANGED
@@ -1,16 +1,17 @@
 import os
 from datetime import datetime, timedelta
+from sys import platform
 from typing import Any, Dict
 
 import gradio as gr
 import pandas as pd
+from cachetools import TTLCache, cached
 from diskcache import Cache
 from dotenv import load_dotenv
 from httpx import Client
 from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
 from tqdm.auto import tqdm
 from tqdm.contrib.concurrent import thread_map
-from cachetools import TTLCache, cached
 
 load_dotenv()
 
@@ -42,19 +43,24 @@ client = Client(
 cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
 
 
+def get_three_months_ago():
+    now = datetime.now()
+    return now - timedelta(days=90)
+
+
+def parse_date(date_str):
+    # parse the created date from string 2023-11-17T16:39:54.000Z to datetime
+    return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
+
+
 def add_created_data(dataset):
     _id = dataset._id
-    created =
+    created = parse_date(dataset.createdAt)
     dataset_dict = dataset.__dict__
-    dataset_dict["
+    dataset_dict["createdAt"] = created
     return dataset_dict
 
 
-def get_three_months_ago():
-    now = datetime.now()
-    return now - timedelta(days=90)
-
-
 def get_readme_len(dataset: Dict[str, Any]):
     try:
         url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
@@ -110,7 +116,8 @@ def get_datasets():
 def load_data():
     datasets = get_datasets()
     datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
-
+    # datasets = [dataset.__dict__ for dataset in tqdm(datasets)]
+    filtered = [ds for ds in datasets if ds["createdAt"] > get_three_months_ago()]
     ds_with_len = thread_map(get_readme_len, filtered)
     ds_with_len = [ds for ds in ds_with_len if ds is not None]
     ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
@@ -122,7 +129,7 @@ columns_to_drop = [
     "cardData",
     "gated",
     "sha",
-    "paperswithcode_id",
+    # "paperswithcode_id",
     "tags",
     "description",
     "siblings",
@@ -150,11 +157,11 @@ def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to
 
 
 def filter_df_by_max_age(df, max_age_days=None):
-    df = df.dropna(subset=["
+    df = df.dropna(subset=["createdAt"])
     now = datetime.now()
     if max_age_days is not None:
         max_date = now - timedelta(days=max_age_days)
-        df = df[df["
+        df = df[df["createdAt"] >= max_date]
     return df
 
 
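Taken together, the commit adds a parse_date helper for createdAt strings, a get_three_months_ago cutoff, and a createdAt-based dropna/filter in filter_df_by_max_age. Below is a minimal, standalone sketch of how those pieces fit together; the sample rows and the __main__ driver are illustrative assumptions, not part of app.py, which builds its DataFrame from list_datasets metadata instead.

from datetime import datetime, timedelta

import pandas as pd


def parse_date(date_str):
    # parse a created date string such as 2023-11-17T16:39:54.000Z into a datetime
    return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")


def get_three_months_ago():
    # cutoff used to keep only datasets created in roughly the last three months
    now = datetime.now()
    return now - timedelta(days=90)


def filter_df_by_max_age(df, max_age_days=None):
    # drop rows without a createdAt value, then keep only rows newer than max_age_days
    df = df.dropna(subset=["createdAt"])
    now = datetime.now()
    if max_age_days is not None:
        max_date = now - timedelta(days=max_age_days)
        df = df[df["createdAt"] >= max_date]
    return df


if __name__ == "__main__":
    # hypothetical sample rows standing in for the dataset metadata app.py collects
    rows = [
        {"id": "org/new-dataset", "createdAt": parse_date("2023-11-17T16:39:54.000Z")},
        {"id": "org/old-dataset", "createdAt": parse_date("2022-01-01T00:00:00.000Z")},
        {"id": "org/unknown-dataset", "createdAt": None},
    ]
    df = pd.DataFrame(rows)
    recent = filter_df_by_max_age(df, max_age_days=90)
    print(recent[["id", "createdAt"]])

Note that parse_date assumes millisecond-precision UTC timestamps in exactly this format; on Python 3.11+ datetime.fromisoformat would also accept the trailing "Z", but the commit itself uses strptime as shown.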