mehradans92 committed on
Commit
e5ddac0
·
1 Parent(s): b06418a

added arXiv API class

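A minimal usage sketch of the new class (hypothetical call site, not part of the commit; assumes it runs inside a Streamlit app, since download_pdf writes to st.session_state):

    from utils import XRxivQuery

    query = XRxivQuery('serverless computing bioinformatics',
                       max_results=5,
                       XRxiv_servers=['rxiv', 'biorxiv'])  # server keys checked in call_API
    papers = query.call_API()   # list of (title, url, authors, category, folder, citation) tuples
    query.download_pdf()        # saves PDFs into ./docs and renders citations in the app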
test/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (151 Bytes)
 
test/__pycache__/test.cpython-38.pyc DELETED
Binary file (1.33 kB)
 
utils.py CHANGED
@@ -1,5 +1,4 @@
 import urllib
-from lxml import html
 import streamlit as st
 import requests
 import re
@@ -7,8 +6,188 @@ from stqdm import stqdm
 import os
 import shutil
 import time
+from bs4 import BeautifulSoup as bs
+from datetime import datetime
+from urllib.parse import quote


+class XRxivQuery:
+    def __init__(self, search_query, max_results, folder_name='docs', XRxiv_servers=[], search_by='all', sort_by='relevance'):
+        self.search_query = search_query
+        self.max_results = max_results
+        self.folder_name = folder_name
+        self.XRxiv_servers = XRxiv_servers
+        self.search_by = search_by
+        self.sort_by = sort_by
+        self.all_pdf_info = []
+        self.all_pdf_citation = []
+
+    def call_API(self):
+        # Split the query into individual keywords
+        search_query = self.search_query.strip().replace(" ", "+").split('+')
+        if 'rxiv' in self.XRxiv_servers:
+            '''
+            Scrapes the arXiv API's Atom feed to get data from each entry in a search. Entries have the following format:
+            <entry>\n
+            <id>http://arxiv.org/abs/2008.04584v2</id>\n
+            <updated>2021-05-11T12:00:24Z</updated>\n
+            <published>2020-08-11T08:47:06Z</published>\n
+            <title>Bayesian Selective Inference: Non-informative Priors</title>\n
+            <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
+            <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
+            <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
+            <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
+            <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
+            <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+            <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+            <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
+            </entry>\n
+            '''
+            print('Searching arXiv\n')
+            # Call arXiv API
+            journal = 'arXiv'
+            arXiv_url = f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={self.max_results}'
+            with urllib.request.urlopen(arXiv_url) as url:
+                s = url.read()
+
+            # Parse the XML data; lxml is imported locally since the module-level import was removed
+            from lxml import html
+            root = html.fromstring(s)
+            # Fetch relevant PDF information
+            pdf_entries = root.xpath("entry")
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, pdf in enumerate(pdf_entries):
+                pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
+                pdf_authors.append(pdf.xpath("author/name/text()"))
+                pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
+                pdf_categories.append(pdf.xpath("category/@term"))
+                folder_names.append(self.folder_name)
+                pdf_years.append(pdf.xpath('updated/text()')[0][:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+        if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
+            '''
+            Scrapes bioRxiv's/medRxiv's search-results HTML to get data from each entry in a search. Entries have the following format:
+            <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
+            <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
+            <span class="highwire-cite-title">
+            <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
+            <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
+            <div class="highwire-cite-authors"><span class="highwire-citation-authors">
+            <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
+            <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
+            <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
+            <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
+            <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
+            <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
+            <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
+            <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
+            <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
+            </div>
+            </div></li>
+            '''
+            if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
+                print('Searching bioRxiv\n')
+                journals_str = '%20jcode%3Abiorxiv'
+            if 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
+                print('Searching medRxiv\n')
+                journals_str = '%20jcode%3Amedrxiv'
+            if 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
+                print('Searching both bioRxiv and medRxiv\n')
+                journals_str = '%20jcode%3Abiorxiv%7C%7Cmedrxiv'
+
+            # Percent-encode the keywords: '%20' joins words within a subject, '%252B' separates subjects
+            subject_str = ('%20').join(search_query[0].split())
+            for subject in search_query[1:]:
+                subject_str = subject_str + '%252B' + ('%20').join(subject.split())
+
+            current_dateTime = datetime.now()
+            today = str(current_dateTime)[:10]
+            start_day = '2013-01-01'
+            arXiv_url = 'https://www.biorxiv.org/search/'
+            arXiv_url += subject_str + journals_str + f'%20limit_from%3A{start_day}%20limit_to%3A{today}%20numresults%3A{self.max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+
+            url_response = requests.post(arXiv_url)
+            html = bs(url_response.text, features='html.parser')
+            pdf_entries = html.find_all(attrs={'class': 'search-result'})
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, pdf in enumerate(pdf_entries):
+                pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
+                pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
+                pdf_url = pdf.find('a', href=True)['href']
+                if pdf_url[:4] != 'http':
+                    pdf_url = 'http://www.biorxiv.org' + pdf_url
+                pdf_urls.append(pdf_url)
+                pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
+                folder_names.append(self.folder_name)
+                pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+        # Flatten the per-server lists into one list of papers
+        self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
+        return self.all_pdf_info
+
+    def download_pdf(self):
+        all_reference_text = []
+        for i, p in enumerate(stqdm(self.all_pdf_info, desc='Searching and downloading papers')):
+            pdf_title = p[0]
+            pdf_category = p[3]
+            pdf_url = p[1]
+            if pdf_category in ['medRxiv', 'bioRxiv']:
+                pdf_url += '.full.pdf'
+            pdf_file_name = p[0].replace(':', '').replace('/', '').replace('.', '')
+            folder_name = p[4]
+            pdf_citation = p[5]
+            r = requests.get(pdf_url, allow_redirects=True)
+            if i == 0:
+                if not os.path.exists(f'{folder_name}'):
+                    os.makedirs(f"{folder_name}")
+                else:
+                    # Empty the folder first to avoid reusing papers from old runs
+                    shutil.rmtree(f'{folder_name}')
+                    os.makedirs(f"{folder_name}")
+            with open(f'{folder_name}/{pdf_file_name}.pdf', 'wb') as f:
+                f.write(r.content)
+            if i == 0:
+                st.markdown("###### Papers found:")
+            st.markdown(f"{i+1}. {pdf_citation}")
+            time.sleep(0.15)
+            all_reference_text.append(f"{i+1}. {pdf_citation}\n")
+        if 'all_reference_text' not in st.session_state:
+            st.session_state.key = 'all_reference_text'
+        st.session_state['all_reference_text'] = ' '.join(all_reference_text)
+
+
 def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     '''
     Scrapes the arXiv API's Atom feed to get data from each entry in a search. Entries have the following format:
@@ -29,11 +208,13 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_resul
     '''

     # Remove spaces in the search query
-    search_query = search_query.strip().replace(" ", "+")
+    search_query = search_query.strip().replace(" ", "+").replace(", ", "+").replace(",", "+")
     # Call arXiv API
     arXiv_url = f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
     with urllib.request.urlopen(arXiv_url) as url:
         s = url.read()
+
+    from lxml import html  # imported here now that the module-level lxml import is gone

     # Parse the XML data
     root = html.fromstring(s)
@@ -101,3 +282,148 @@ def download_pdf(pdf_info):
     if 'all_reference_text' not in st.session_state:
         st.session_state.key = 'all_reference_text'
     st.session_state['all_reference_text'] = ' '.join(all_reference_text)
+
+
+def call_bioArXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
+    '''
+    Scrapes bioRxiv's/medRxiv's search-results HTML to get data from each entry in a search. Entries have the following format:
+    <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
+    <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
+    <span class="highwire-cite-title">
+    <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
+    <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
+    <div class="highwire-cite-authors"><span class="highwire-citation-authors">
+    <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
+    <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
+    <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
+    <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
+    <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
+    <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
+    <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
+    <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
+    <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
+    </div>
+    </div></li>
+    '''
+
+    # Remove spaces in the search query and split it into subjects
+    search_query = search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+").split('+')
+    subject_str = ('%20').join(search_query[0].split())
+    for subject in search_query[1:]:
+        subject_str = subject_str + '%252B' + ('%20').join(subject.split())
+
+    current_dateTime = datetime.now()
+    today = str(current_dateTime)[:10]
+    journal = 'biorxiv'
+
+    bio_arXiv_url = 'https://www.biorxiv.org/search/'
+
+    launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
+
+    both = False
+    bio_only = True
+    med_only = False
+    if bio_only:
+        journal = 'biorxiv'
+        journals_str = f'%20jcode%3A{journal}'
+    if both:
+        journal = 'biorxiv'
+        journals_str = f'%20jcode%3A{journal}%7C%7Cmedrxiv'
+    if med_only:
+        journal = 'medrxiv'
+        journals_str = f'%20jcode%3A{journal}'
+    start_day = launch_dates[journal]
+    bio_arXiv_url += subject_str + journals_str + f'%20limit_from%3A{start_day}%20limit_to%3A{today}%20numresults%3A{max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+
+    url_response = requests.post(bio_arXiv_url)
+    html = bs(url_response.text, features='html.parser')
+    pdf_entries = html.find_all(attrs={'class': 'search-result'})
+
+    pdf_titles = []
+    pdf_authors = []
+    pdf_urls = []
+    pdf_categories = []
+    folder_names = []
+    pdf_citation = []
+    pdf_years = []
+
+    for i, pdf in enumerate(pdf_entries):
+        pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
+        pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
+        pdf_url = pdf.find('a', href=True)['href']
+        if pdf_url[:4] != 'http':
+            pdf_url = 'http://www.biorxiv.org' + pdf_url
+        pdf_urls.append(pdf_url)
+        pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
+        folder_names.append(folder_name)
+        pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
+        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+
+    pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+
+    # Check number of available files
+    print('Requesting {max_results} files'.format(max_results=max_results))
+    if len(pdf_urls) < int(max_results):
+        matching_pdf_num = len(pdf_urls)
+        print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
+    return pdf_info
+
+
+from tqdm import tqdm  # download_bio_pdf uses the plain tqdm progress bar
+
+
+def download_bio_pdf(pdf_info):
+    for p in tqdm(pdf_info):
+        pdf_title = p[0].replace(':', '').replace('/', '-').replace('.', '')
+        pdf_url = p[1] + '.full.pdf'
+        pdf_author = p[2]
+        pdf_category = p[3]
+        folder_name = p[4]
+        pdf_citation = p[5]
+        r = requests.get(pdf_url, allow_redirects=True)
+        if not os.path.exists(folder_name):
+            os.makedirs(f"{folder_name}")
+        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
+            f.write(r.content)
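For reference, a sketch of the search URL the bioRxiv/medRxiv branch assembles (illustrative values only; it mirrors the percent-encoded segments built in call_API and call_bioArXiv_API for the query 'serverless computing bioinformatics' on both servers):

    # Not part of the commit: reproduces the URL shape the code constructs.
    subject_str = 'serverless%252Bcomputing%252Bbioinformatics'
    journals_str = '%20jcode%3Abiorxiv%7C%7Cmedrxiv'
    url = ('https://www.biorxiv.org/search/' + subject_str + journals_str
           + '%20limit_from%3A2013-01-01%20limit_to%3A2023-02-17'
           + '%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard')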