hysts HF staff committed on
Commit
5c4c264
·
1 Parent(s): 9db7887
Files changed (3) hide show
  1. app.py +1 -1
  2. papers.py +12 -21
  3. requirements.txt +0 -1
app.py CHANGED
@@ -24,7 +24,7 @@ with gr.Blocks(css="style.css") as demo:
24
  with gr.Row():
25
  start_date = Calendar(label="Start date", type="datetime", value="2023-05-05")
26
  end_date = Calendar(label="End date", type="datetime")
27
- search_title = gr.Textbox(label="Search by title")
28
 
29
  num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
30
  df = gr.Dataframe(
 
24
  with gr.Row():
25
  start_date = Calendar(label="Start date", type="datetime", value="2023-05-05")
26
  end_date = Calendar(label="End date", type="datetime")
27
+ search_title = gr.Textbox(label="Search title")
28
 
29
  num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
30
  df = gr.Dataframe(
papers.py CHANGED
@@ -4,7 +4,6 @@ import operator
4
 
5
  import datasets
6
  import pandas as pd
7
- import requests
8
  import tqdm.auto
9
 
10
 
@@ -16,31 +15,19 @@ class PaperInfo:
16
  title: str
17
  paper_page: str
18
  upvotes: int
19
- published_at: str
20
-
21
- def __post_init__(self):
22
- object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))
23
-
24
- @staticmethod
25
- def convert_timestamp(timestamp: str) -> str:
26
- try:
27
- return datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%Y/%m/%d %H:%M:%S")
28
- except ValueError:
29
- return timestamp
30
 
31
 
32
  def get_df() -> pd.DataFrame:
33
- df = datasets.load_dataset("hysts-bot-data/daily-papers")["train"].to_pandas()
34
- df = df.drop(columns=["title"])
 
 
 
35
  paper_info = []
36
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
37
- res = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}").json()
38
  info = PaperInfo(
39
  **row,
40
- title=res["title"],
41
  paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
42
- upvotes=res["upvotes"],
43
- published_at=res["publishedAt"],
44
  )
45
  paper_info.append(info)
46
  return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
@@ -65,7 +52,6 @@ class Prettifier:
65
  return f'<div class="{class_name}">{text}</div>'
66
 
67
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
68
- df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
69
  new_rows = []
70
  for _, row in df.iterrows():
71
  new_row = dict(row) | {
@@ -99,7 +85,12 @@ class PaperList:
99
  def column_datatype(self):
100
  return list(map(operator.itemgetter(1), self.COLUMN_INFO))
101
 
102
- def search(self, start_date: datetime.datetime, end_date: datetime.datetime, title: str) -> pd.DataFrame:
 
 
 
 
 
103
  df = self.df_raw.copy()
104
  df["date"] = pd.to_datetime(df["date"])
105
 
@@ -108,7 +99,7 @@ class PaperList:
108
  df["date"] = df["date"].dt.strftime("%Y-%m-%d")
109
 
110
  # Filter by title
111
- df = df[df["title"].str.contains(title, case=False)]
112
 
113
  df_prettified = self._prettifier(df).loc[:, self.column_names]
114
  return df_prettified
 
4
 
5
  import datasets
6
  import pandas as pd
 
7
  import tqdm.auto
8
 
9
 
 
15
  title: str
16
  paper_page: str
17
  upvotes: int
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  def get_df() -> pd.DataFrame:
21
+ df = pd.merge(
22
+ left=datasets.load_dataset("hysts-bot-data/daily-papers")["train"].to_pandas(),
23
+ right=datasets.load_dataset("hysts-bot-data/daily-papers-upvotes")["train"].to_pandas(),
24
+ on="arxiv_id",
25
+ )
26
  paper_info = []
27
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
 
28
  info = PaperInfo(
29
  **row,
 
30
  paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
 
 
31
  )
32
  paper_info.append(info)
33
  return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
 
52
  return f'<div class="{class_name}">{text}</div>'
53
 
54
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
 
55
  new_rows = []
56
  for _, row in df.iterrows():
57
  new_row = dict(row) | {
 
85
  def column_datatype(self):
86
  return list(map(operator.itemgetter(1), self.COLUMN_INFO))
87
 
88
+ def search(
89
+ self,
90
+ start_date: datetime.datetime,
91
+ end_date: datetime.datetime,
92
+ title_search_query: str,
93
+ ) -> pd.DataFrame:
94
  df = self.df_raw.copy()
95
  df["date"] = pd.to_datetime(df["date"])
96
 
 
99
  df["date"] = df["date"].dt.strftime("%Y-%m-%d")
100
 
101
  # Filter by title
102
+ df = df[df["title"].str.contains(title_search_query, case=False)]
103
 
104
  df_prettified = self._prettifier(df).loc[:, self.column_names]
105
  return df_prettified
requirements.txt CHANGED
@@ -3,5 +3,4 @@ gradio==4.21.0
3
  gradio_calendar==0.0.4
4
  huggingface_hub==0.21.4
5
  pandas==2.2.0
6
- requests==2.31.0
7
  tqdm==4.66.1
 
3
  gradio_calendar==0.0.4
4
  huggingface_hub==0.21.4
5
  pandas==2.2.0
 
6
  tqdm==4.66.1