Commit 277d92a
Parent(s): 6cbbc77

added unittest, cleaned up

Files changed:
- app.py  +1 -2
- test/test.py  +9 -0
- utils.py  +20 -256
app.py CHANGED
@@ -43,7 +43,7 @@ def search_click_callback(search_query, max_results, XRxiv_servers=[]):
     return pdf_info
 
 with st.form(key='columns_in_form', clear_on_submit = False):
-    c1, c2, c3 = st.columns([
+    c1, c2, c3 = st.columns([5, 0.8, 4])
     with c1:
         search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value='CFD Modeling'
         )#search_query, max_results_current))
@@ -117,7 +117,6 @@ if submitButton:
     st.write(f"{st.session_state['all_reference_text']}")
     with st.spinner('⏳ Please wait...'):
         start = time.time()
-        print(word_count)
         final_answer = answer_callback(question_query, word_count)
         length_answer = len(final_answer)
         st.text_area("Answer:", final_answer, height=max(length_answer//4, 100))
test/test.py CHANGED
@@ -27,5 +27,14 @@ class Utils(unittest.TestCase):
         self.assertTrue(os.path.exists(dowloaded_dir))
         shutil.rmtree(f'docs/')
 
+    def test_distibute_max_papers(self):
+        XRxiv_servers = ['rxiv', 'medrxiv']
+        max_results = 10
+        max_papers_in_server = distibute_max_papers(max_results, XRxiv_servers)
+        self.assertEqual(max_results, np.sum(max_papers_in_server))
+        self.assertEqual(max_papers_in_server[2], 0)
+        self.assertGreater(max_papers_in_server[0],0)
+        self.assertGreater(max_papers_in_server[3],0)
+
 if __name__ == '__main__':
     unittest.main()
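Note: the new test case exercises the distibute_max_papers helper added to utils.py below. A minimal sketch of the same checks outside unittest, assuming utils.py and its dependencies are importable from the repo root (the counts name is only for illustration), could look like:

    import numpy as np
    from utils import distibute_max_papers

    # Ask for 10 papers spread over two servers; indices follow
    # ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv'] as defined in utils.py.
    counts = distibute_max_papers(10, ['rxiv', 'medrxiv'])
    assert int(np.sum(counts)) == 10        # the full budget is distributed
    assert counts[2] == 0                   # biorxiv was not requested
    assert counts[0] > 0 and counts[3] > 0  # each requested server gets at least one paper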
utils.py CHANGED
@@ -8,7 +8,8 @@ import shutil
 import time
 from bs4 import BeautifulSoup as bs
 from datetime import datetime
-from
+from random import uniform as rand
+import numpy as np
 
 
 class XRxivQuery:
@@ -24,6 +25,7 @@ class XRxivQuery:
 
     def call_API(self):
         search_query = self.search_query.strip().replace(" ", "+").split('+')#.replace(", ", "+").replace(",", "+")#.split('+')
+        max_papers_in_server = distibute_max_papers(self.max_results, self.XRxiv_servers)
         if 'rxiv' in self.XRxiv_servers:
             '''
             Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
@@ -42,13 +44,10 @@
             <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
             </entry>\n
             '''
-            print('Searching Arxiv\n')
             # Call arXiv API
             journal = 'arXiv'
-
-
-            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={self.max_results}'
-            # print(arXiv_url)
+            max_rxiv_papers = max_papers_in_server[0]
+            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={max_rxiv_papers}'
             with urllib.request.urlopen(arXiv_url) as url:
                 s = url.read()
 
@@ -72,7 +71,6 @@
                 folder_names.append(self.folder_name)
                 pdf_years.append(pdf.xpath('updated/text()')[0][:4])
                 pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-                # self.all_pdf_citation.append(pdf_citation)
             pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
             self.all_pdf_info.append(pdf_info)
 
@@ -99,13 +97,16 @@
             </entry>\n
             '''
             if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
-                print('Searching biorxiv\n')
+                # print('Searching biorxiv\n')
+                max_biorxiv_papers = max_papers_in_server[2]
                 journals_str = f'%20jcode%3Abiorxiv'
             if 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
-                print('Searching medrxiv\n')
+                # print('Searching medrxiv\n')
+                max_biorxiv_papers = max_papers_in_server[3]
                 journals_str = f'%20jcode%3Amedrxiv'
             if 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
-                print('Searching both biorxiv and medrxiv\n')
+                # print('Searching both biorxiv and medrxiv\n')
+                max_biorxiv_papers = max_papers_in_server[3] + max_papers_in_server[2]  # biorxiv and medrxiv are searched together.
                 journals_str = f'%20jcode%3Abiorxiv%7C%7Cmedrxiv'
 
             subject_str = ('%20').join(self.search_query[0].split())
@@ -116,7 +117,7 @@
             today = str(current_dateTime)[:10]
             start_day = '2013-01-01'
             arXiv_url = f'https://www.biorxiv.org/search/'
-            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{
+            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_biorxiv_papers}%20sort%3Arelevance-rank%20format_result%3Astandard'
 
             url_response = requests.post(arXiv_url)
             html = bs(url_response.text, features='html.parser')
@@ -145,7 +146,6 @@
             self.all_pdf_info.append(pdf_info)
 
         self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
-        print(self.all_pdf_info)
         return self.all_pdf_info
 
     def download_pdf(self):
@@ -183,247 +183,11 @@
 
 
 
-
-
-
-
-
-
-
-
-    <entry>\n
-    <id>http://arxiv.org/abs/2008.04584v2</id>\n
-    <updated>2021-05-11T12:00:24Z</updated>\n
-    <published>2020-08-11T08:47:06Z</published>\n
-    <title>Bayesian Selective Inference: Non-informative Priors</title>\n
-    <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
-    <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
-    <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
-    <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
-    <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
-    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-    <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-    <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
-    </entry>\n
-    '''
-
-    # Remove space in seach query
-    search_query=search_query.strip().replace(" ", "+").replace(", ","+").replace(",","+")
-    # Call arXiv API
-    arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-    with urllib.request.urlopen(arXiv_url) as url:
-        s = url.read()
-
-    from lxml import html
-
-    # Parse the xml data
-    root = html.fromstring(s)
-    # Fetch relevant pdf information
-    pdf_entries = root.xpath("entry")
-
-    pdf_titles = []
-    pdf_authors = []
-    pdf_urls = []
-    pdf_categories = []
-    folder_names = []
-    pdf_citation = []
-    pdf_years = []
-
-    for i, pdf in enumerate(pdf_entries):
-        pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
-        pdf_authors.append(pdf.xpath("author/name/text()"))
-        pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
-        pdf_categories.append(pdf.xpath("category/@term"))
-        folder_names.append(folder_name)
-        pdf_years.append(pdf.xpath('updated/text()')[0][:4])
-        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-
-
-
-    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-    # Check number of available files
-    if len(pdf_urls)<int(max_results):
-        matching_pdf_num=len(pdf_urls)
-        # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-    return pdf_info, pdf_citation
-
-
-def download_pdf(pdf_info):
-
-    # if len(os.listdir(f'./{folder_name}') ) != 0:
-    # check folder is empty to avoid using papers from old runs:
-    # os.remove(f'./{folder_name}/*')
-    # print(pdf_info)
-    all_reference_text = []
-    for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
-        pdf_title=p[0].replace(':','').replace('/','').replace('.','')
-        pdf_category=p[3]
-        pdf_url=p[1]
-        if pdf_category in ['medRxiv', 'bioRxiv']:
-            pdf_url += '.full.pdf'
-        pdf_author=p[2]
-        folder_name=p[4]
-        pdf_citation=p[5]
-        r = requests.get(pdf_url, allow_redirects=True)
-        if i == 0:
-            if not os.path.exists(f'{folder_name}'):
-                os.makedirs(f"{folder_name}")
-            else:
-                shutil.rmtree(f'{folder_name}')
-                os.makedirs(f"{folder_name}")
-        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
-            f.write(r.content)
-        if i == 0:
-            st.markdown("###### Papers found:")
-        st.markdown(f"{i+1}. {pdf_citation}")
-        time.sleep(0.15)
-        all_reference_text.append(f"{i+1}. {pdf_citation}\n")
-    if 'all_reference_text' not in st.session_state:
-        st.session_state.key = 'all_reference_text'
-    st.session_state['all_reference_text'] = ' '.join(all_reference_text)
-
-
-
-def call_bioArXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
-    '''
-    Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
-    <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
-    <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
-    <span class="highwire-cite-title">
-    <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
-    <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
-    <div class="highwire-cite-authors"><span class="highwire-citation-authors">
-    <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
-    <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
-    <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
-    <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
-    <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
-    <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
-    <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
-    <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
-    <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
-    </div>
-    </div></li>
-    </entry>\n
-    '''
-
-    # Remove space in seach query
-    search_query=search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+").split('+')
-    subject_str = ('%20').join(search_query[0].split())
-    for subject in search_query[1:]:
-        subject_str = subject_str + '%252B' + ('%20').join(subject.split())
-
-    # print(subject_str)
-    # Call arXiv API
-    # bio_arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-    # "https://api.biorxiv.org"
-    current_dateTime = datetime.now()
-    today = str(current_dateTime)[:10]
-    journal = 'biorxiv'
-    # journals_str = '%20jcode%3Amedrxiv%7C%7Cbiorxiv'
-
-    bio_arXiv_url = f'https://www.biorxiv.org/search/'
-    # kwd_str = 'abstract_title%3A' + ('%252C%2B').join([search_query[0]] + [('%2B').join(keyword.split()) for keyword in search_query[1:]])
-    # print(kwd_str)
-    # kwd_str = kwd_str + '%20abstract_title_flags%3Amatch-' + 'all'
-    # bio_arXiv_url += '%20' + kwd_str
-
-    launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
-
-    both = False
-    bio_only = True
-    med_only = False
-    if bio_only:
-        print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Abiorxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard\n bio_only')
-        journal = 'biorxiv'
-        journals_str = f'%20jcode%3A{journal}'
-    if both:
-        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A2022-11-06%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n both')
-        journal = 'biorxiv'
-        journals_str = f'%20jcode%3A{journal}%7C%7Cmedrxiv'
-    if med_only:
-        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n med_only')
-        journal = 'medrxiv'
-        journals_str = f'%20jcode%3A{journal}'
-    start_day = launch_dates[journal]
-    bio_arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
-
-    # print(bio_arXiv_url)
-    url_response = requests.post(bio_arXiv_url)
-    html = bs(url_response.text, features='html.parser')
-    pdf_entries = html.find_all(attrs={'class': 'search-result'})
-    # print(articles)
-
-    # with urllib.request.urlopen(bio_arXiv_url) as url:
-    # s = url.read()
-    # # Parse the xml data
-    # root = html.fromstring(s)
-    # # Fetch relevant pdf information
-    # pdf_entries = root.xpath("entry")
-    # print(pdf_entries)
-    pdf_titles = []
-    pdf_authors = []
-    pdf_urls = []
-    pdf_categories = []
-    folder_names = []
-    pdf_citation = []
-    pdf_years = []
-
-    for i, pdf in enumerate(pdf_entries):
-        # print(pdf.xpath('updated/text()')[0][:4])
-        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
-        # print(pdf)
-        # [article.find('span', attrs={'class': 'highwire-cite-title'}).text.strip() if article.find('span', attrs={'class': 'highwire-cite-title'}) is not None else None for article in articles]
-        pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
-        # print(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip())
-        pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
-        # print(pdf_authors)
-
-        # print(f'http://www.{journal}.org')
-        pdf_url = pdf.find('a', href=True)['href']
-        if pdf_url[:4] != 'http':
-            pdf_url = f'http://www.biorxiv.org'+ pdf_url
-        pdf_urls.append(pdf_url)
-        pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
-        # print(pdf_categories)
-        folder_names.append(folder_name)
-        pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
-
-        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-        # print(pdf_citation)
-
-        # break
-
-
-
-    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-    # Check number of available files
-    print('Requesting {max_results} files'.format(max_results=max_results))
-    if len(pdf_urls)<int(max_results):
-        matching_pdf_num=len(pdf_urls)
-        print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-    return pdf_info
-
-import urllib.request as urllib2
-
-def download_bio_pdf(pdf_info):
-    for p in tqdm(pdf_info):
-        pdf_title=p[0].replace(':','').replace('/','-').replace('.','')
-        pdf_url=p[1] + '.full.pdf'
-        # print(pdf_url)
-        pdf_author=p[2]
-        pdf_category=p[3]
-        print(pdf_category)
-        folder_name=p[4]
-        pdf_citation=p[5]
-        r = requests.get(pdf_url, allow_redirects=True)
-        # print(r)
-        print(pdf_url)
-        # r = requests.get(pdf_url, stream=True)
-        if not os.path.exists(folder_name):
-            os.makedirs(f"{folder_name}")
-        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
-            f.write(r.content)
-
+def distibute_max_papers(max_results, XRxiv_servers):
+    fixed_length = len(XRxiv_servers)
+    sample = np.random.multinomial(max_results - fixed_length, np.ones(fixed_length)/fixed_length, size=1)[0] + 1
+    max_papers_in_server = np.zeros(4, dtype=int)
+    all_servers = ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv']
+    for i,s in enumerate(XRxiv_servers):
+        max_papers_in_server[all_servers.index(s)] = int(sample[i])
+    return max_papers_in_server
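For context on the new helper: distibute_max_papers reserves one paper for every requested server and splits the remainder of max_results with np.random.multinomial over a uniform probability vector, so the per-server counts vary between runs but always sum to max_results. A standalone numpy sketch of that step (variable names here are illustrative, not part of the commit):

    import numpy as np

    max_results, n_servers = 10, 2
    # Split (max_results - n_servers) uniformly at random, then add the
    # guaranteed one paper per server back on top.
    sample = np.random.multinomial(max_results - n_servers, np.ones(n_servers) / n_servers, size=1)[0] + 1
    print(sample, sample.sum())  # e.g. [6 4] 10 -- always sums to max_results, each entry >= 1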