mehradans92 committed
Commit 64cac75 · Parent: 75c3b48

fixed download arxiv

Files changed (1)
  1. utils.py +5 -5
utils.py CHANGED
@@ -8,6 +8,7 @@ import os
 import shutil
 import time
 
+
 def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     '''
     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
@@ -48,8 +49,6 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     pdf_years = []
 
     for i, pdf in enumerate(pdf_entries):
-        # print(pdf.xpath('updated/text()')[0][:4])
-        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
         pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
         pdf_authors.append(pdf.xpath("author/name/text()"))
         pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
@@ -63,7 +62,6 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
 
     # Check number of available files
-    # print('Requesting {max_results} files'.format(max_results=max_results))
     if len(pdf_urls)<int(max_results):
         matching_pdf_num=len(pdf_urls)
         # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
@@ -79,9 +77,11 @@ def download_pdf(pdf_info):
     all_reference_text = []
     for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
         pdf_title=p[0].replace(':','').replace('/','').replace('.','')
-        pdf_url=p[1] + '.full.pdf'
-        pdf_author=p[2]
         pdf_category=p[3]
+        pdf_url=p[1]
+        if pdf_category in ['medRxiv', 'bioRxiv']:
+            pdf_url += '.full.pdf'
+        pdf_author=p[2]
         folder_name=p[4]
         pdf_citation=p[5]
         r = requests.get(pdf_url, allow_redirects=True)
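
The substance of the fix: the old loop appended '.full.pdf' to every link, which is only correct for bioRxiv/medRxiv, where the feed entry points at the article page and the PDF lives at that URL plus '.full.pdf'. arXiv's link[@title='pdf'] href already serves the PDF directly, so the unconditional suffix broke arXiv downloads. A minimal sketch of the corrected logic in isolation; resolve_pdf_url and fetch_pdf are hypothetical names, not functions in utils.py:

import requests

def resolve_pdf_url(url, category):
    # Hypothetical helper mirroring the commit's new branch:
    # bioRxiv/medRxiv link to the article page, and the PDF lives at
    # '<url>.full.pdf'; arXiv PDF links need no suffix.
    if category in ['medRxiv', 'bioRxiv']:
        return url + '.full.pdf'
    return url

def fetch_pdf(url, category, out_path):
    # Same download call as utils.py: follow redirects, save the bytes.
    r = requests.get(resolve_pdf_url(url, category), allow_redirects=True)
    r.raise_for_status()
    with open(out_path, 'wb') as f:
        f.write(r.content)

Keeping the suffix decision in one helper would also make it easy to add further preprint servers later without touching the download loop.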
 
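For context on the xpath calls in the unchanged lines above: call_arXiv_API pulls each entry's title, authors, and PDF link out of arXiv's Atom feed. A self-contained sketch of that query-and-parse step, written with lxml.etree and explicit Atom namespaces against the standard arXiv API endpoint (assumed here; the repo's exact request isn't shown in this diff). utils.py parses the response with namespace-free XPath, so its paths (title/text(), author/name/text()) look slightly different, and search_arxiv is a hypothetical name:

import requests
from lxml import etree

ATOM = {'atom': 'http://www.w3.org/2005/Atom'}

def search_arxiv(query, max_results=10):
    # Query the arXiv API and collect the same fields the utils.py
    # loop appends per entry: title, authors, and the PDF link.
    r = requests.get('http://export.arxiv.org/api/query',
                     params={'search_query': f'all:{query}',
                             'sortBy': 'relevance',
                             'max_results': max_results})
    r.raise_for_status()
    root = etree.fromstring(r.content)
    entries = []
    for entry in root.xpath('//atom:entry', namespaces=ATOM):
        entries.append({
            'title': entry.xpath('atom:title/text()', namespaces=ATOM)[0],
            'authors': entry.xpath('atom:author/atom:name/text()', namespaces=ATOM),
            'pdf_url': entry.xpath("atom:link[@title='pdf']/@href", namespaces=ATOM)[0],
        })
    return entries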