Commit 64cac75
Parent(s): 75c3b48
fixed download arxiv
utils.py CHANGED
@@ -8,6 +8,7 @@ import os
 import shutil
 import time
 
+
 def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     '''
     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
@@ -48,8 +49,6 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_resul
     pdf_years = []
 
     for i, pdf in enumerate(pdf_entries):
-        # print(pdf.xpath('updated/text()')[0][:4])
-        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
         pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
         pdf_authors.append(pdf.xpath("author/name/text()"))
         pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
@@ -63,7 +62,6 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_resul
     pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
 
     # Check number of available files
-    # print('Requesting {max_results} files'.format(max_results=max_results))
     if len(pdf_urls)<int(max_results):
         matching_pdf_num=len(pdf_urls)
         # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
@@ -79,9 +77,11 @@ def download_pdf(pdf_info):
     all_reference_text = []
     for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
         pdf_title=p[0].replace(':','').replace('/','').replace('.','')
-        pdf_url=p[1] + '.full.pdf'
-        pdf_author=p[2]
         pdf_category=p[3]
+        pdf_url=p[1]
+        if pdf_category in ['medRxiv', 'bioRxiv']:
+            pdf_url += '.full.pdf'
+        pdf_author=p[2]
         folder_name=p[4]
         pdf_citation=p[5]
         r = requests.get(pdf_url, allow_redirects=True)
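The substantive change is in download_pdf: previously '.full.pdf' was appended to every URL, but the href taken from link[@title='pdf'] on arXiv already points at the PDF file, so the suffix presumably broke arXiv downloads; only bioRxiv/medRxiv landing URLs need it. Below is a minimal sketch of the corrected branching in isolation. The resolve_pdf_url helper and the example URLs are illustrative, not part of the commit; only the category check mirrors the committed logic.

import requests

def resolve_pdf_url(base_url, category):
    # Hypothetical helper mirroring the committed branch: bioRxiv/medRxiv
    # serve the PDF at '<landing-url>.full.pdf', while arXiv's
    # link[@title='pdf'] href already points directly at the PDF.
    if category in ['medRxiv', 'bioRxiv']:
        return base_url + '.full.pdf'
    return base_url

# Illustrative values, not taken from the commit:
url = resolve_pdf_url('https://arxiv.org/pdf/1706.03762', 'cs.CL')
r = requests.get(url, allow_redirects=True)  # fetched as in download_pdf
with open('paper.pdf', 'wb') as f:
    f.write(r.content)

Appending the suffix only when pdf_category is medRxiv or bioRxiv means arXiv requests hit the real PDF URL instead of a non-existent '<url>.full.pdf' path.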