rbiswasfc committed
Commit 5520038 · 1 Parent(s): f0c7f30
Files changed (1)
  1. main.py +29 -56
main.py CHANGED
@@ -66,7 +66,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")

         if debug:
-            if len(items) > 200:
+            if len(items) > 500:
                 break

     return items
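
Note: this hunk only raises the debug early-exit cap from 200 to 500 items; the fetch loop around it is unchanged and not part of this diff. For orientation, a minimal sketch of the kind of paginated fetch such a cap usually guards, assuming the public Zotero Web API v3 (the endpoint, params, and header here are illustrative, not taken from main.py):

    import requests

    def get_zotero_items_sketch(user_id, api_key, debug=False):
        # hypothetical paginated fetch; main.py's actual client code is not shown in this commit
        items, start, limit = [], 0, 100
        while True:
            resp = requests.get(
                f"https://api.zotero.org/users/{user_id}/items",
                params={"start": start, "limit": limit, "format": "json"},
                headers={"Zotero-API-Key": api_key},
            )
            batch = resp.json()
            if not batch:
                break
            items.extend(batch)
            start += limit
            print(f"# items fetched {len(items)}")
            if debug:
                if len(items) > 500:  # same cap as the updated line above
                    break
        return items
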
@@ -153,9 +153,7 @@ def parse_html_content(html):

     # Extract paper title
     try:
-        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(
-            strip=True
-        )
+        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
     except Exception:
         paper_title = soup.find("title").get_text(strip=True)
         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
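
For the fallback branch kept above: when the ltx_title lookup fails, the page <title> is used instead, and on arXiv HTML pages that title carries a "[<id>]" prefix which the regex strips. A quick illustration of that regex (the sample title is made up):

    import re

    title = "[2401.12345v2] A Hypothetical Paper Title"
    print(re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", title))  # -> "A Hypothetical Paper Title"
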
@@ -170,9 +168,7 @@ def parse_html_content(html):
     if abstract:
         result.append(
             {
-                "content": " ".join(
-                    p.get_text(strip=True) for p in abstract.find_all("p")
-                ).replace(")", ") "),
+                "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
                 "title": "Abstract",
                 "paper_title": paper_title,
                 "content_type": "abstract",
@@ -182,11 +178,7 @@ def parse_html_content(html):
     sections = soup.find_all("section", class_="ltx_section")
     for index, section in enumerate(sections):
         section_title = section.find("h2", class_="ltx_title ltx_title_section")
-        section_title = (
-            section_title.get_text(strip=True)
-            if section_title
-            else f"Section {index + 1}"
-        )
+        section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
         section_content = section.get_text(strip=True).replace(")", ") ")

         content_type = "body"
@@ -281,9 +273,7 @@ def parse_markdown_content(md_content, arxiv_id):
                     "content": " ".join(content),
                     "title": current_title,
                     "paper_title": paper_title,
-                    "content_type": get_content_type(
-                        current_section, len(parsed)
-                    ),
+                    "content_type": get_content_type(current_section, len(parsed)),
                     "arxiv_id": arxiv_id,
                 }
             )
@@ -393,13 +383,7 @@ def create_hf_image_dataset(base_dir):

         # Add the data
         data.append(
-            {
-                "image": image_path,
-                "arxiv_id": arxiv_id,
-                "page_number": page_number,
-                "width": width,
-                "height": height,
-            }
+            {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
         )

     # Create the dataset
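
The rest of create_hf_image_dataset is outside this hunk; a minimal sketch of how a record list like the one built above typically becomes an image dataset, assuming the datasets library's Dataset.from_list and Image feature (the path and values are illustrative):

    from datasets import Dataset, Image

    data = [
        {
            "image": "data/arxiv_images/2401.12345/page_1.png",  # illustrative path
            "arxiv_id": "2401.12345",
            "page_number": 1,
            "width": 1275,
            "height": 1650,
        }
    ]
    ds = Dataset.from_list(data).cast_column("image", Image())
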
@@ -435,23 +419,24 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     )

     # upload image dataset
-    img_ds = create_hf_image_dataset("data/arxiv_images")
-    img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
-
-    # push id_to_abstract
-    abstract_ds = Dataset.from_pandas(abstract_df)
-    abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
-
-    # push arxiv_items
-    arxiv_ds = Dataset.from_pandas(contents_df)
-    arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
-
-    # push processed_arxiv_ids
-    processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
-    processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-    processed_arxiv_ids_ds.push_to_hub(
-        repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN")
-    )
+    try:
+        img_ds = create_hf_image_dataset("data/arxiv_images")
+        img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+
+        # push id_to_abstract
+        abstract_ds = Dataset.from_pandas(abstract_df)
+        abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+
+        # push arxiv_items
+        arxiv_ds = Dataset.from_pandas(contents_df)
+        arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+
+        # push processed_arxiv_ids
+        processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
+        processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
+        processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)


 ########################################################
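
Each push above targets a separate config of the same Hub repo, so the new try/except turns an upload failure into a logged error instead of an unhandled exception. For reference, a hedged sketch of how those configs would be read back (the repo id is a placeholder, not the one configured in main.py):

    from datasets import load_dataset

    repo_id = "user/zotero-arxiv"  # placeholder
    images = load_dataset(repo_id, "images")
    abstracts = load_dataset(repo_id, "abstracts")
    articles = load_dataset(repo_id, "articles")
    processed_ids = load_dataset(repo_id, "processed_arxiv_ids")["train"]["arxiv_id"]
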
@@ -467,9 +452,7 @@ def main():

     # get already processed arxiv ids from HF
     try:
-        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"][
-            "arxiv_id"
-        ]
+        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
         try:
@@ -481,9 +464,7 @@ def main():
     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")

     # new arxiv items
-    arxiv_items = [
-        item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids
-    ]
+    arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
     arxiv_items = fetch_arxiv_htmls(arxiv_items)
     print(f"# of new arxiv items: {len(arxiv_items)}")

@@ -521,11 +502,7 @@ def main():
                 id_to_abstract[item["arxiv_id"]] = entry["content"]
                 break
     print(f"# of abstracts: {len(id_to_abstract)}")
-    abstract_df = (
-        pd.Series(id_to_abstract)
-        .reset_index()
-        .rename(columns={"index": "arxiv_id", 0: "abstract"})
-    )
+    abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
     print(abstract_df.head())

     # add to existing dataset
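
The collapsed one-liner above reshapes the id-to-abstract dict into a two-column frame; a tiny standalone illustration (sample values made up):

    import pandas as pd

    id_to_abstract = {"2401.00001": "First abstract...", "2401.00002": "Second abstract..."}
    abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
    print(abstract_df.columns.tolist())  # -> ['arxiv_id', 'abstract']
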
@@ -537,9 +514,7 @@ def main():
     print(old_abstract_df.head())

     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
-    abstract_df = abstract_df.drop_duplicates(
-        subset=["arxiv_id"], keep="last"
-    ).reset_index(drop=True)
+    abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

     # contents
     contents_df = pd.DataFrame(arxiv_items)
@@ -553,9 +528,7 @@ def main():
     print(old_contents_df.sample().T)

     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
-    contents_df = contents_df.drop_duplicates(
-        subset=["arxiv_id"], keep="last"
-    ).reset_index(drop=True)
+    contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

     # upload to hf
     processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))
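
Both drop_duplicates hunks above implement the same upsert pattern: concatenate old and new rows, then keep="last" so the freshly scraped row wins on an arxiv_id collision. A small pandas illustration (values made up):

    import pandas as pd

    old = pd.DataFrame({"arxiv_id": ["2401.00001", "2401.00002"], "abstract": ["old a", "old b"]})
    new = pd.DataFrame({"arxiv_id": ["2401.00002", "2401.00003"], "abstract": ["new b", "new c"]})
    merged = pd.concat([old, new]).reset_index(drop=True)
    merged = merged.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
    # keeps: (2401.00001, "old a"), (2401.00002, "new b"), (2401.00003, "new c")
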
 