main.py
CHANGED
@@ -66,7 +66,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")
 
         if debug:
-            if len(items) >
+            if len(items) > 500:
                 break
 
     return items
@@ -153,9 +153,7 @@ def parse_html_content(html):
 
     # Extract paper title
     try:
-        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(
-            strip=True
-        )
+        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
     except Exception:
         paper_title = soup.find("title").get_text(strip=True)
         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
@@ -170,9 +168,7 @@ def parse_html_content(html):
     if abstract:
         result.append(
             {
-                "content": " ".join(
-                    p.get_text(strip=True) for p in abstract.find_all("p")
-                ).replace(")", ") "),
+                "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
                 "title": "Abstract",
                 "paper_title": paper_title,
                 "content_type": "abstract",
@@ -182,11 +178,7 @@ def parse_html_content(html):
     sections = soup.find_all("section", class_="ltx_section")
     for index, section in enumerate(sections):
         section_title = section.find("h2", class_="ltx_title ltx_title_section")
-        section_title = (
-            section_title.get_text(strip=True)
-            if section_title
-            else f"Section {index + 1}"
-        )
+        section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
         section_content = section.get_text(strip=True).replace(")", ") ")
 
         content_type = "body"
@@ -281,9 +273,7 @@ def parse_markdown_content(md_content, arxiv_id):
                     "content": " ".join(content),
                     "title": current_title,
                     "paper_title": paper_title,
-                    "content_type": get_content_type(
-                        current_section, len(parsed)
-                    ),
+                    "content_type": get_content_type(current_section, len(parsed)),
                     "arxiv_id": arxiv_id,
                 }
             )
@@ -393,13 +383,7 @@ def create_hf_image_dataset(base_dir):
 
         # Add the data
         data.append(
-            {
-                "image": image_path,
-                "arxiv_id": arxiv_id,
-                "page_number": page_number,
-                "width": width,
-                "height": height,
-            }
+            {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
         )
 
     # Create the dataset
@@ -435,23 +419,24 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     )
 
     # upload image dataset
-    img_ds = create_hf_image_dataset("data/arxiv_images")
-    img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
-
-    # push id_to_abstract
-    abstract_ds = Dataset.from_pandas(abstract_df)
-    abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
-
-    # push arxiv_items
-    arxiv_ds = Dataset.from_pandas(contents_df)
-    arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
-
-    # push processed_arxiv_ids
-    processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
-    processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-    processed_arxiv_ids_ds.push_to_hub(
-        repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN")
-    )
+    try:
+        img_ds = create_hf_image_dataset("data/arxiv_images")
+        img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+
+        # push id_to_abstract
+        abstract_ds = Dataset.from_pandas(abstract_df)
+        abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+
+        # push arxiv_items
+        arxiv_ds = Dataset.from_pandas(contents_df)
+        arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+
+        # push processed_arxiv_ids
+        processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
+        processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
+        processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)
 
 
 ########################################################
@@ -467,9 +452,7 @@ def main():
 
     # get already processed arxiv ids from HF
    try:
-        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"][
-            "arxiv_id"
-        ]
+        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
         try:
@@ -481,9 +464,7 @@ def main():
     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
 
     # new arxiv items
-    arxiv_items = [
-        item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids
-    ]
+    arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
     arxiv_items = fetch_arxiv_htmls(arxiv_items)
     print(f"# of new arxiv items: {len(arxiv_items)}")
 
@@ -521,11 +502,7 @@ def main():
                 id_to_abstract[item["arxiv_id"]] = entry["content"]
                 break
     print(f"# of abstracts: {len(id_to_abstract)}")
-    abstract_df = (
-        pd.Series(id_to_abstract)
-        .reset_index()
-        .rename(columns={"index": "arxiv_id", 0: "abstract"})
-    )
+    abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
     print(abstract_df.head())
 
     # add to existing dataset
@@ -537,9 +514,7 @@ def main():
     print(old_abstract_df.head())
 
     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
-    abstract_df = abstract_df.drop_duplicates(
-        subset=["arxiv_id"], keep="last"
-    ).reset_index(drop=True)
+    abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
 
     # contents
     contents_df = pd.DataFrame(arxiv_items)
@@ -553,9 +528,7 @@ def main():
     print(old_contents_df.sample().T)
 
     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
-    contents_df = contents_df.drop_duplicates(
-        subset=["arxiv_id"], keep="last"
-    ).reset_index(drop=True)
+    contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
 
     # upload to hf
     processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))