mehradans92 committed
Commit 277d92a · 1 Parent(s): 6cbbc77

added unittest, cleaned up

Files changed (3)
  1. app.py +1 -2
  2. test/test.py +9 -0
  3. utils.py +20 -256
app.py CHANGED
@@ -43,7 +43,7 @@ def search_click_callback(search_query, max_results, XRxiv_servers=[]):
     return pdf_info
 
 with st.form(key='columns_in_form', clear_on_submit = False):
-    c1, c2, c3 = st.columns([6, 0.6, 3])
+    c1, c2, c3 = st.columns([5, 0.8, 4])
     with c1:
         search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value='CFD Modeling'
         )#search_query, max_results_current))
@@ -117,7 +117,6 @@ if submitButton:
     st.write(f"{st.session_state['all_reference_text']}")
     with st.spinner('⏳ Please wait...'):
         start = time.time()
-        print(word_count)
         final_answer = answer_callback(question_query, word_count)
         length_answer = len(final_answer)
         st.text_area("Answer:", final_answer, height=max(length_answer//4, 100))
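Note on the layout change above: st.columns takes a list of relative widths, so [6, 0.6, 3] and [5, 0.8, 4] both split the form row into three columns, just with different proportions. A minimal sketch of the new call (labels shortened here; not part of the commit):

    import streamlit as st

    # Relative widths: c1 gets 5/9.8 of the row, c2 gets 0.8/9.8, c3 gets 4/9.8
    c1, c2, c3 = st.columns([5, 0.8, 4])
    with c1:
        search_query = st.text_input("Input search query here:")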
test/test.py CHANGED
@@ -27,5 +27,14 @@ class Utils(unittest.TestCase):
         self.assertTrue(os.path.exists(dowloaded_dir))
         shutil.rmtree(f'docs/')
 
+    def test_distibute_max_papers(self):
+        XRxiv_servers = ['rxiv', 'medrxiv']
+        max_results = 10
+        max_papers_in_server = distibute_max_papers(max_results, XRxiv_servers)
+        self.assertEqual(max_results, np.sum(max_papers_in_server))
+        self.assertEqual(max_papers_in_server[2], 0)
+        self.assertGreater(max_papers_in_server[0],0)
+        self.assertGreater(max_papers_in_server[3],0)
+
 if __name__ == '__main__':
     unittest.main()
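The new test exercises distibute_max_papers from utils.py (diff below): with ['rxiv', 'medrxiv'] selected, the returned array is indexed as ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv'], so the allocations must sum to max_results, index 2 (biorxiv) must stay 0, and indices 0 and 3 must each get at least one paper. A minimal sketch for running just this test (the import path is an assumption about the repo layout):

    import unittest
    from test.test import Utils  # assumed import path

    suite = unittest.TestSuite()
    suite.addTest(Utils('test_distibute_max_papers'))
    unittest.TextTestRunner(verbosity=2).run(suite)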
utils.py CHANGED
@@ -8,7 +8,8 @@ import shutil
 import time
 from bs4 import BeautifulSoup as bs
 from datetime import datetime
-from urllib.parse import quote
+from random import uniform as rand
+import numpy as np
 
 
 class XRxivQuery:
@@ -24,6 +25,7 @@ class XRxivQuery:
 
     def call_API(self):
         search_query = self.search_query.strip().replace(" ", "+").split('+')#.replace(", ", "+").replace(",", "+")#.split('+')
+        max_papers_in_server = distibute_max_papers(self.max_results, self.XRxiv_servers)
         if 'rxiv' in self.XRxiv_servers:
             '''
             Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
@@ -42,13 +44,10 @@ class XRxivQuery:
             <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
             </entry>\n
             '''
-            print('Searching Arxiv\n')
             # Call arXiv API
             journal = 'arXiv'
-            # print(" ".join(search_query))
-            # print(self.search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+"))
-            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={self.max_results}'
-            # print(arXiv_url)
+            max_rxiv_papers = max_papers_in_server[0]
+            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={max_rxiv_papers}'
             with urllib.request.urlopen(arXiv_url) as url:
                 s = url.read()
 
@@ -72,7 +71,6 @@ class XRxivQuery:
                 folder_names.append(self.folder_name)
                 pdf_years.append(pdf.xpath('updated/text()')[0][:4])
                 pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-                # self.all_pdf_citation.append(pdf_citation)
             pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
             self.all_pdf_info.append(pdf_info)
 
@@ -99,13 +97,16 @@ class XRxivQuery:
             </entry>\n
             '''
             if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
-                print('Searching biorxiv\n')
+                # print('Searching biorxiv\n')
+                max_biorxiv_papers = max_papers_in_server[2]
                 journals_str = f'%20jcode%3Abiorxiv'
             if 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
-                print('Searching medrxiv\n')
+                # print('Searching medrxiv\n')
+                max_biorxiv_papers = max_papers_in_server[3]
                 journals_str = f'%20jcode%3Amedrxiv'
             if 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
-                print('Searching both biorxiv and medrxiv\n')
+                # print('Searching both biorxiv and medrxiv\n')
+                max_biorxiv_papers = max_papers_in_server[3]+ max_papers_in_server[2] # birxiv and medrxiv are together.
                 journals_str = f'%20jcode%3Abiorxiv%7C%7Cmedrxiv'
 
             subject_str = ('%20').join(self.search_query[0].split())
@@ -116,7 +117,7 @@ class XRxivQuery:
             today = str(current_dateTime)[:10]
             start_day = '2013-01-01'
             arXiv_url = f'https://www.biorxiv.org/search/'
-            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{self.max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_biorxiv_papers}%20sort%3Arelevance-rank%20format_result%3Astandard'
 
             url_response = requests.post(arXiv_url)
             html = bs(url_response.text, features='html.parser')
@@ -145,7 +146,6 @@ class XRxivQuery:
             self.all_pdf_info.append(pdf_info)
 
         self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
-        print(self.all_pdf_info)
         return self.all_pdf_info
 
     def download_pdf(self):
@@ -183,247 +183,11 @@ class XRxivQuery:
 
 
 
-
-
-
-
-
-def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
-    '''
-    Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
-    <entry>\n
-    <id>http://arxiv.org/abs/2008.04584v2</id>\n
-    <updated>2021-05-11T12:00:24Z</updated>\n
-    <published>2020-08-11T08:47:06Z</published>\n
-    <title>Bayesian Selective Inference: Non-informative Priors</title>\n
-    <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
-    <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
-    <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
-    <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
-    <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
-    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-    <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-    <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
-    </entry>\n
-    '''
-
-    # Remove space in seach query
-    search_query=search_query.strip().replace(" ", "+").replace(", ","+").replace(",","+")
-    # Call arXiv API
-    arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-    with urllib.request.urlopen(arXiv_url) as url:
-        s = url.read()
-
-    from lxml import html
-
-    # Parse the xml data
-    root = html.fromstring(s)
-    # Fetch relevant pdf information
-    pdf_entries = root.xpath("entry")
-
-    pdf_titles = []
-    pdf_authors = []
-    pdf_urls = []
-    pdf_categories = []
-    folder_names = []
-    pdf_citation = []
-    pdf_years = []
-
-    for i, pdf in enumerate(pdf_entries):
-        pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
-        pdf_authors.append(pdf.xpath("author/name/text()"))
-        pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
-        pdf_categories.append(pdf.xpath("category/@term"))
-        folder_names.append(folder_name)
-        pdf_years.append(pdf.xpath('updated/text()')[0][:4])
-        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-
-
-
-    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-    # Check number of available files
-    if len(pdf_urls)<int(max_results):
-        matching_pdf_num=len(pdf_urls)
-        # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-    return pdf_info, pdf_citation
-
-
-def download_pdf(pdf_info):
-
-    # if len(os.listdir(f'./{folder_name}') ) != 0:
-    # check folder is empty to avoid using papers from old runs:
-    #     os.remove(f'./{folder_name}/*')
-    # print(pdf_info)
-    all_reference_text = []
-    for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
-        pdf_title=p[0].replace(':','').replace('/','').replace('.','')
-        pdf_category=p[3]
-        pdf_url=p[1]
-        if pdf_category in ['medRxiv', 'bioRxiv']:
-            pdf_url += '.full.pdf'
-        pdf_author=p[2]
-        folder_name=p[4]
-        pdf_citation=p[5]
-        r = requests.get(pdf_url, allow_redirects=True)
-        if i == 0:
-            if not os.path.exists(f'{folder_name}'):
-                os.makedirs(f"{folder_name}")
-            else:
-                shutil.rmtree(f'{folder_name}')
-                os.makedirs(f"{folder_name}")
-        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
-            f.write(r.content)
-        if i == 0:
-            st.markdown("###### Papers found:")
-        st.markdown(f"{i+1}. {pdf_citation}")
-        time.sleep(0.15)
-        all_reference_text.append(f"{i+1}. {pdf_citation}\n")
-    if 'all_reference_text' not in st.session_state:
-        st.session_state.key = 'all_reference_text'
-    st.session_state['all_reference_text'] = ' '.join(all_reference_text)
-
-
-
-def call_bioArXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
-    '''
-    Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
-    <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
-    <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
-    <span class="highwire-cite-title">
-    <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
-    <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
-    <div class="highwire-cite-authors"><span class="highwire-citation-authors">
-    <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
-    <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
-    <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
-    <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
-    <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
-    <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
-    <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
-    <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
-    <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
-    </div>
-    </div></li>
-    </entry>\n
-    '''
-
-    # Remove space in seach query
-    search_query=search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+").split('+')
-    subject_str = ('%20').join(search_query[0].split())
-    for subject in search_query[1:]:
-        subject_str = subject_str + '%252B' + ('%20').join(subject.split())
-
-    # print(subject_str)
-    # Call arXiv API
-    # bio_arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-    # "https://api.biorxiv.org"
-    current_dateTime = datetime.now()
-    today = str(current_dateTime)[:10]
-    journal = 'biorxiv'
-    # journals_str = '%20jcode%3Amedrxiv%7C%7Cbiorxiv'
-
-    bio_arXiv_url = f'https://www.biorxiv.org/search/'
-    # kwd_str = 'abstract_title%3A' + ('%252C%2B').join([search_query[0]] + [('%2B').join(keyword.split()) for keyword in search_query[1:]])
-    # print(kwd_str)
-    # kwd_str = kwd_str + '%20abstract_title_flags%3Amatch-' + 'all'
-    # bio_arXiv_url += '%20' + kwd_str
-
-    launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
-
-    both = False
-    bio_only = True
-    med_only = False
-    if bio_only:
-        print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Abiorxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard\n bio_only')
-        journal = 'biorxiv'
-        journals_str = f'%20jcode%3A{journal}'
-    if both:
-        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A2022-11-06%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n both')
-        journal = 'biorxiv'
-        journals_str = f'%20jcode%3A{journal}%7C%7Cmedrxiv'
-    if med_only:
-        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n med_only')
-        journal = 'medrxiv'
-        journals_str = f'%20jcode%3A{journal}'
-    start_day = launch_dates[journal]
-    bio_arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
-
-    # print(bio_arXiv_url)
-    url_response = requests.post(bio_arXiv_url)
-    html = bs(url_response.text, features='html.parser')
-    pdf_entries = html.find_all(attrs={'class': 'search-result'})
-    # print(articles)
-
-    # with urllib.request.urlopen(bio_arXiv_url) as url:
-    #     s = url.read()
-    # # Parse the xml data
-    # root = html.fromstring(s)
-    # # Fetch relevant pdf information
-    # pdf_entries = root.xpath("entry")
-    # print(pdf_entries)
-    pdf_titles = []
-    pdf_authors = []
-    pdf_urls = []
-    pdf_categories = []
-    folder_names = []
-    pdf_citation = []
-    pdf_years = []
-
-    for i, pdf in enumerate(pdf_entries):
-        # print(pdf.xpath('updated/text()')[0][:4])
-        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
-        # print(pdf)
-        # [article.find('span', attrs={'class': 'highwire-cite-title'}).text.strip() if article.find('span', attrs={'class': 'highwire-cite-title'}) is not None else None for article in articles]
-        pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
-        # print(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip())
-        pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
-        # print(pdf_authors)
-
-        # print(f'http://www.{journal}.org')
-        pdf_url = pdf.find('a', href=True)['href']
-        if pdf_url[:4] != 'http':
-            pdf_url = f'http://www.biorxiv.org'+ pdf_url
-        pdf_urls.append(pdf_url)
-        pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
-        # print(pdf_categories)
-        folder_names.append(folder_name)
-        pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
-
-        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-        # print(pdf_citation)
-
-        # break
-
-
-
-    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-    # Check number of available files
-    print('Requesting {max_results} files'.format(max_results=max_results))
-    if len(pdf_urls)<int(max_results):
-        matching_pdf_num=len(pdf_urls)
-        print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-    return pdf_info
-
-import urllib.request as urllib2
-
-def download_bio_pdf(pdf_info):
-    for p in tqdm(pdf_info):
-        pdf_title=p[0].replace(':','').replace('/','-').replace('.','')
-        pdf_url=p[1] + '.full.pdf'
-        # print(pdf_url)
-        pdf_author=p[2]
-        pdf_category=p[3]
-        print(pdf_category)
-        folder_name=p[4]
-        pdf_citation=p[5]
-        r = requests.get(pdf_url, allow_redirects=True)
-        # print(r)
-        print(pdf_url)
-        # r = requests.get(pdf_url, stream=True)
-        if not os.path.exists(folder_name):
-            os.makedirs(f"{folder_name}")
-        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
-            f.write(r.content)
-
+def distibute_max_papers(max_results, XRxiv_servers):
+    fixed_length = len(XRxiv_servers)
+    sample = np.random.multinomial(max_results - fixed_length, np.ones(fixed_length)/fixed_length, size=1)[0] + 1
+    max_papers_in_server = np.zeros(4, dtype=int)
+    all_servers = ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv']
+    for i,s in enumerate(XRxiv_servers):
+        max_papers_in_server[all_servers.index(s)] = int(sample[i])
+    return max_papers_in_server
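For context, the new distibute_max_papers helper reserves one paper per selected server and spreads the remaining max_results - len(XRxiv_servers) slots with a uniform multinomial draw, returning a length-4 array indexed as ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv']. A standalone sketch with the inputs used in the new unit test (function body copied from the diff; the exact split varies because the draw is random):

    import numpy as np

    def distibute_max_papers(max_results, XRxiv_servers):
        # One guaranteed slot per selected server, remainder split uniformly at random
        fixed_length = len(XRxiv_servers)
        sample = np.random.multinomial(max_results - fixed_length,
                                       np.ones(fixed_length) / fixed_length, size=1)[0] + 1
        max_papers_in_server = np.zeros(4, dtype=int)
        all_servers = ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv']
        for i, s in enumerate(XRxiv_servers):
            max_papers_in_server[all_servers.index(s)] = int(sample[i])
        return max_papers_in_server

    split = distibute_max_papers(10, ['rxiv', 'medrxiv'])
    print(split, split.sum())  # e.g. [6 0 0 4] 10 -- always sums to 10; chemrxiv and biorxiv stay 0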