mehradans92 committed
Commit 64cac75 · Parent: 75c3b48

fixed download arxiv

Files changed (1)
  1. utils.py +5 -5
utils.py CHANGED
@@ -8,6 +8,7 @@ import os
 import shutil
 import time
 
+
 def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     '''
     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
@@ -48,8 +49,6 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     pdf_years = []
 
     for i, pdf in enumerate(pdf_entries):
-        # print(pdf.xpath('updated/text()')[0][:4])
-        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
         pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
         pdf_authors.append(pdf.xpath("author/name/text()"))
         pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
@@ -63,7 +62,6 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
 
     # Check number of available files
-    # print('Requesting {max_results} files'.format(max_results=max_results))
     if len(pdf_urls)<int(max_results):
         matching_pdf_num=len(pdf_urls)
         # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
@@ -79,9 +77,11 @@ def download_pdf(pdf_info):
     all_reference_text = []
     for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
         pdf_title=p[0].replace(':','').replace('/','').replace('.','')
-        pdf_url=p[1] + '.full.pdf'
-        pdf_author=p[2]
         pdf_category=p[3]
+        pdf_url=p[1]
+        if pdf_category in ['medRxiv', 'bioRxiv']:
+            pdf_url += '.full.pdf'
+        pdf_author=p[2]
         folder_name=p[4]
         pdf_citation=p[5]
         r = requests.get(pdf_url, allow_redirects=True)
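
The substance of the fix: the old loop appended '.full.pdf' to every link, which is only correct for bioRxiv/medRxiv, where the feed entry points at the article page and the PDF lives at that URL plus '.full.pdf'. arXiv's link[@title='pdf'] href already serves the PDF directly, so the unconditional suffix broke arXiv downloads. A minimal sketch of the corrected logic in isolation; resolve_pdf_url and fetch_pdf are hypothetical names, not functions in utils.py:

import requests

def resolve_pdf_url(url, category):
    # Hypothetical helper mirroring the commit's new branch:
    # bioRxiv/medRxiv link to the article page, and the PDF lives at
    # '<url>.full.pdf'; arXiv PDF links need no suffix.
    if category in ['medRxiv', 'bioRxiv']:
        return url + '.full.pdf'
    return url

def fetch_pdf(url, category, out_path):
    # Same download call as utils.py: follow redirects, save the bytes.
    r = requests.get(resolve_pdf_url(url, category), allow_redirects=True)
    r.raise_for_status()
    with open(out_path, 'wb') as f:
        f.write(r.content)

Keeping the suffix decision in one helper would also make it easy to add further preprint servers later without touching the download loop.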
 
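For context on the xpath calls in the unchanged lines above: call_arXiv_API pulls each entry's title, authors, and PDF link out of arXiv's Atom feed. A self-contained sketch of that query-and-parse step, written with lxml.etree and explicit Atom namespaces against the standard arXiv API endpoint (assumed here; the repo's exact request isn't shown in this diff). utils.py parses the response with namespace-free XPath, so its paths (title/text(), author/name/text()) look slightly different, and search_arxiv is a hypothetical name:

import requests
from lxml import etree

ATOM = {'atom': 'http://www.w3.org/2005/Atom'}

def search_arxiv(query, max_results=10):
    # Query the arXiv API and collect the same fields the utils.py
    # loop appends per entry: title, authors, and the PDF link.
    r = requests.get('http://export.arxiv.org/api/query',
                     params={'search_query': f'all:{query}',
                             'sortBy': 'relevance',
                             'max_results': max_results})
    r.raise_for_status()
    root = etree.fromstring(r.content)
    entries = []
    for entry in root.xpath('//atom:entry', namespaces=ATOM):
        entries.append({
            'title': entry.xpath('atom:title/text()', namespaces=ATOM)[0],
            'authors': entry.xpath('atom:author/atom:name/text()', namespaces=ATOM),
            'pdf_url': entry.xpath("atom:link[@title='pdf']/@href", namespaces=ATOM)[0],
        })
    return entries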