Spaces:
Runtime error
Runtime error
Commit
·
479b4a3
1
Parent(s):
a0915df
add paper parsing
Browse files
- app.py +34 -12
- requirements.in +2 -2
app.py
CHANGED
@@ -6,12 +6,13 @@ from collections import defaultdict
|
|
6 |
import gradio as gr
|
7 |
from cachetools import TTLCache, cached
|
8 |
from cytoolz import groupby
|
9 |
-
from huggingface_hub import get_collection, list_datasets,
|
|
|
10 |
from tqdm.auto import tqdm
|
11 |
|
12 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
13 |
is_macos = platform.system() == "Darwin"
|
14 |
-
LIMIT = None
|
15 |
CACHE_TIME = 60 * 5 # 5 minutes
|
16 |
|
17 |
|
@@ -34,13 +35,8 @@ def check_for_arxiv_id(model):
|
|
34 |
|
35 |
|
36 |
def extract_arxiv_id(input_string: str) -> str:
|
37 |
-
# Define the regular expression pattern
|
38 |
pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")
|
39 |
-
|
40 |
-
# Search for the pattern in the input string
|
41 |
match = pattern.search(input_string)
|
42 |
-
|
43 |
-
# If a match is found, return the numeric part of the ARXIV ID, else return None
|
44 |
return match[1] if match else None
|
45 |
|
46 |
|
@@ -72,13 +68,17 @@ def create_dataset_to_arxiv_id_dict():
|
|
72 |
return dataset_to_arxiv_id
|
73 |
|
74 |
|
75 |
-
|
|
|
|
|
|
|
|
|
76 |
|
77 |
|
78 |
def group_collection_items(collection_slug: str):
|
79 |
collection = get_collection(collection_slug)
|
80 |
items = collection.items
|
81 |
-
return groupby(
|
82 |
|
83 |
|
84 |
def get_papers_for_collection(collection_slug: str):
|
@@ -87,8 +87,10 @@ def get_papers_for_collection(collection_slug: str):
|
|
87 |
collection = group_collection_items(collection_slug)
|
88 |
collection_datasets = collection.get("datasets", None)
|
89 |
collection_models = collection.get("models", None)
|
|
|
90 |
dataset_papers = defaultdict(dict)
|
91 |
model_papers = defaultdict(dict)
|
|
|
92 |
if collection_datasets is not None:
|
93 |
for dataset in collection_datasets:
|
94 |
if arxiv_ids := dataset_to_arxiv_id.get(dataset.item_id, None):
|
@@ -111,7 +113,24 @@ def get_papers_for_collection(collection_slug: str):
|
|
111 |
],
|
112 |
}
|
113 |
model_papers[model.item_id] = data
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
|
117 |
placeholder_url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
|
@@ -119,8 +138,10 @@ slug_input = gr.Textbox(
|
|
119 |
placeholder=placeholder_url, interactive=True, label="Collection slug", max_lines=1
|
120 |
)
|
121 |
description = (
|
122 |
-
"Enter a
|
123 |
-
" datasets in the collection."
|
|
|
|
|
124 |
)
|
125 |
|
126 |
examples = [
|
@@ -131,6 +152,7 @@ gr.Interface(
|
|
131 |
get_papers_for_collection,
|
132 |
slug_input,
|
133 |
"json",
|
|
|
134 |
description=description,
|
135 |
examples=examples,
|
136 |
cache_examples=True,
|
|
|
6 |
import gradio as gr
|
7 |
from cachetools import TTLCache, cached
|
8 |
from cytoolz import groupby
|
9 |
+
from huggingface_hub import (CollectionItem, get_collection, list_datasets,
|
10 |
+
list_models)
|
11 |
from tqdm.auto import tqdm
|
12 |
|
13 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
14 |
is_macos = platform.system() == "Darwin"
|
15 |
+
LIMIT = 1000 if is_macos else None # limit for local dev because slooow internet
|
16 |
CACHE_TIME = 60 * 5 # 5 minutes
|
17 |
|
18 |
|
|
|
35 |
|
36 |
|
37 |
def extract_arxiv_id(input_string: str) -> str:
    """Extract a bare arXiv ID (e.g. '2301.01234') from *input_string*.

    Looks for a token of the form ``arxiv:<digits>.<digits>`` and returns
    only the numeric part. Returns None when no such token is present
    (note: the annotation understates this — the result may be None).
    """
    found = re.search(r"\barxiv:(\d+\.\d+)\b", input_string)
    return found[1] if found else None
|
41 |
|
42 |
|
|
|
68 |
return dataset_to_arxiv_id
|
69 |
|
70 |
|
71 |
+
def get_collection_type(collection_item: CollectionItem):
    """Return the plural type bucket for a collection item (e.g. 'models').

    Items without an ``item_type`` attribute map to None so they can be
    grouped separately by the caller.
    """
    try:
        item_type = collection_item.item_type
    except AttributeError:
        return None
    return f"{item_type}s"
|
76 |
|
77 |
|
78 |
def group_collection_items(collection_slug: str):
    """Fetch the collection at *collection_slug* and bucket its items.

    Returns a dict mapping plural item type ('models', 'datasets',
    'papers', ... or None) to the list of matching collection items.
    """
    items = get_collection(collection_slug).items
    return groupby(get_collection_type, items)
|
82 |
|
83 |
|
84 |
def get_papers_for_collection(collection_slug: str):
|
|
|
87 |
collection = group_collection_items(collection_slug)
|
88 |
collection_datasets = collection.get("datasets", None)
|
89 |
collection_models = collection.get("models", None)
|
90 |
+
papers = collection.get("papers", None)
|
91 |
dataset_papers = defaultdict(dict)
|
92 |
model_papers = defaultdict(dict)
|
93 |
+
collection_papers = defaultdict(dict)
|
94 |
if collection_datasets is not None:
|
95 |
for dataset in collection_datasets:
|
96 |
if arxiv_ids := dataset_to_arxiv_id.get(dataset.item_id, None):
|
|
|
113 |
],
|
114 |
}
|
115 |
model_papers[model.item_id] = data
|
116 |
+
if papers is not None:
|
117 |
+
for paper in papers:
|
118 |
+
data = {
|
119 |
+
"arxiv_ids": paper.item_id,
|
120 |
+
"hub_paper_links": [f"https://huggingface.co/papers/{paper.item_id}"],
|
121 |
+
}
|
122 |
+
collection_papers[paper.item_id] = data
|
123 |
+
if not dataset_papers:
|
124 |
+
dataset_papers = None
|
125 |
+
if not model_papers:
|
126 |
+
model_papers = None
|
127 |
+
if not collection_papers:
|
128 |
+
collection_papers = None
|
129 |
+
return {
|
130 |
+
"dataset papers": dataset_papers,
|
131 |
+
"model papers": model_papers,
|
132 |
+
"papers": collection_papers,
|
133 |
+
}
|
134 |
|
135 |
|
136 |
placeholder_url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
|
|
|
138 |
placeholder=placeholder_url, interactive=True, label="Collection slug", max_lines=1
|
139 |
)
|
140 |
description = (
|
141 |
+
"Enter a Collection slug to get the ArXiv IDs and Hugging Face Paper links for"
|
142 |
+
" papers associated with models and datasets in the collection. If the collection"
|
143 |
+
" includes papers the ArXiv IDs and Hugging Face Paper links will be returned for"
|
144 |
+
" those papers as well."
|
145 |
)
|
146 |
|
147 |
examples = [
|
|
|
152 |
get_papers_for_collection,
|
153 |
slug_input,
|
154 |
"json",
|
155 |
+
title="ππ: Extract linked papers from a Hugging Face Collection",
|
156 |
description=description,
|
157 |
examples=examples,
|
158 |
cache_examples=True,
|
requirements.in
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
cachetools
|
|
|
2 |
git+https://github.com/huggingface/huggingface_hub
|
3 |
gradio
|
4 |
-
httpx
|
5 |
-
cytoolz
|
|
|
1 |
cachetools
|
2 |
+
cytoolz
|
3 |
git+https://github.com/huggingface/huggingface_hub
|
4 |
gradio
|
5 |
+
httpx
|
|