Commit
·
976f652
1
Parent(s):
ad38c8f
improvements
Browse files
app.py
CHANGED
@@ -40,7 +40,7 @@ def format_row_for_model(row):
|
|
40 |
int2label = {0: "new_dataset", 1: "not_new_dataset"}
|
41 |
|
42 |
|
43 |
-
def get_predictions(data: list[dict], model=None, batch_size=
|
44 |
if model is None:
|
45 |
model = load_model()
|
46 |
predictions = []
|
@@ -65,8 +65,8 @@ def create_markdown(row):
|
|
65 |
updated = updated.strftime("%Y-%m-%d")
|
66 |
broad_category = row["broad_category"]
|
67 |
category = row["category"]
|
68 |
-
return f""" <
|
69 |
-
|
|
70 |
\n\n{abstract}
|
71 |
\n\n [Hugging Face Papers page]({hub_paper_url})
|
72 |
"""
|
@@ -87,34 +87,82 @@ def prepare_data():
|
|
87 |
return df
|
88 |
|
89 |
|
90 |
-
all_possible_arxiv_categories = prepare_data().category.unique().tolist()
|
91 |
-
broad_categories = prepare_data().broad_category.unique().tolist()
|
92 |
|
93 |
|
94 |
-
def create_markdown_summary(categories=
|
95 |
df = prepare_data()
|
96 |
-
if
|
|
|
|
|
|
|
|
|
97 |
df = df[df["broad_category"].isin(categories)]
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
|
101 |
scheduler = BackgroundScheduler()
|
102 |
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
|
103 |
scheduler.start()
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
with gr.Blocks() as demo:
|
106 |
-
gr.Markdown("## New Datasets in Machine Learning")
|
107 |
gr.Markdown(
|
108 |
-
"
|
109 |
-
"
|
110 |
-
)
|
111 |
-
broad_categories = gr.Dropdown(
|
112 |
-
choices=broad_categories,
|
113 |
-
label="Categories",
|
114 |
-
multiselect=True,
|
115 |
-
value=broad_categories,
|
116 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
results = gr.Markdown(create_markdown_summary())
|
118 |
-
broad_categories.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
demo.launch()
|
|
|
40 |
int2label = {0: "new_dataset", 1: "not_new_dataset"}
|
41 |
|
42 |
|
43 |
+
def get_predictions(data: list[dict], model=None, batch_size=64):
|
44 |
if model is None:
|
45 |
model = load_model()
|
46 |
predictions = []
|
|
|
65 |
updated = updated.strftime("%Y-%m-%d")
|
66 |
broad_category = row["broad_category"]
|
67 |
category = row["category"]
|
68 |
+
return f""" <h2> {title} </h2> Updated: {updated}
|
69 |
+
| Category: {broad_category} | Subcategory: {category} |
|
70 |
\n\n{abstract}
|
71 |
\n\n [Hugging Face Papers page]({hub_paper_url})
|
72 |
"""
|
|
|
87 |
return df
|
88 |
|
89 |
|
90 |
+
# Alphabetically sorted, de-duplicated values of the "category" and
# "broad_category" columns of the prepared data frame.
all_possible_arxiv_categories = sorted(prepare_data()["category"].unique().tolist())
broad_categories = sorted(prepare_data()["broad_category"].unique().tolist())
|
92 |
|
93 |
|
94 |
+
def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
    """Build the Markdown listing of papers, optionally filtered.

    Args:
        categories: Broad arXiv categories to keep, matched against the
            ``broad_category`` column. ``None`` disables this filter. It is
            ignored whenever ``narrow_categories`` selects at least one value.
        new_only: When true, keep only rows whose ``prediction`` column is
            ``"new_dataset"``.
        narrow_categories: Specific arXiv categories to keep, matched against
            the ``category`` column. Takes precedence over ``categories``.
            ``None`` or an empty selection disables this filter.

    Returns:
        A Markdown/HTML string made of a centred heading, the number of
        matching rows, and every row's ``markdown`` cell joined by ``<br>``.
    """
    df = prepare_data()
    if new_only:
        df = df[df["prediction"] == "new_dataset"]

    # ``isin`` rejects a bare string, so wrap a single selection in a list.
    if isinstance(categories, str):
        categories = [categories]
    if isinstance(narrow_categories, str):
        narrow_categories = [narrow_categories]

    # An empty narrow selection means "no narrow filter". Testing truthiness
    # (rather than ``is not None``) keeps ``isin([])`` from discarding every
    # row and lets the broad-category filter apply instead.
    if narrow_categories:
        df = df[df["category"].isin(narrow_categories)]
    elif categories is not None:
        df = df[df["broad_category"].isin(categories)]

    heading = (
        "<h1 style='text-align: center'> arXiv papers related to datasets</h1> \n\n"
    )
    count_line = f"Number of results: {len(df)}\n\n"
    entries = "\n\n<br>".join(df["markdown"].tolist())
    return heading + count_line + entries
|
109 |
|
110 |
|
111 |
# Register prepare_data as a cron job (hour=3, minute=30) on a
# BackgroundScheduler and start the scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, trigger="cron", hour=3, minute=30)
scheduler.start()
|
114 |
|
115 |
+
# Introductory copy for the Space, written as Markdown.
description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
The Space works by:
- searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
- passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not

This Space is a work in progress. The model is not perfect, and the search query is not perfect. If you have suggestions for how to improve this Space, please open a Discussion.\n\n"""
|
121 |
+
|
122 |
+
|
123 |
with gr.Blocks() as demo:
|
|
|
124 |
gr.Markdown(
|
125 |
+
"<h1 style='text-align: center'> ✨New Datasets in Machine Learning "
|
126 |
+
" ✨ </h1>"
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
)
|
128 |
+
gr.Markdown(description)
|
129 |
+
with gr.Row():
|
130 |
+
broad_categories = gr.Dropdown(
|
131 |
+
choices=broad_categories,
|
132 |
+
label="Broad arXiv Category",
|
133 |
+
multiselect=True,
|
134 |
+
value="cs",
|
135 |
+
size="sm",
|
136 |
+
)
|
137 |
+
with gr.Accordion("Advanced Options", open=False):
|
138 |
+
gr.Markdown(
|
139 |
+
"Narrow by arXiv categories. **Note** this will take precedence over the"
|
140 |
+
" broad category selection."
|
141 |
+
)
|
142 |
+
narrow_categories = gr.Dropdown(
|
143 |
+
choices=all_possible_arxiv_categories,
|
144 |
+
value=None,
|
145 |
+
multiselect=True,
|
146 |
+
label="Narrow arXiv Category",
|
147 |
+
)
|
148 |
+
gr.ClearButton(narrow_categories, "Clear Narrow Categories", size="sm")
|
149 |
+
with gr.Row():
|
150 |
+
new_only = gr.Checkbox(True, label="New Datasets Only", size="sm")
|
151 |
results = gr.Markdown(create_markdown_summary())
|
152 |
+
broad_categories.change(
|
153 |
+
create_markdown_summary,
|
154 |
+
inputs=[broad_categories, new_only, narrow_categories],
|
155 |
+
outputs=results,
|
156 |
+
)
|
157 |
+
narrow_categories.change(
|
158 |
+
create_markdown_summary,
|
159 |
+
inputs=[broad_categories, new_only, narrow_categories],
|
160 |
+
outputs=results,
|
161 |
+
)
|
162 |
+
new_only.select(
|
163 |
+
create_markdown_summary,
|
164 |
+
[broad_categories, new_only, narrow_categories],
|
165 |
+
results,
|
166 |
+
)
|
167 |
|
168 |
demo.launch()
|