Commit e5ddac0
Parent(s): b06418a
added arXiv API class
Files changed:
- test/__pycache__/__init__.cpython-38.pyc +0 -0
- test/__pycache__/test.cpython-38.pyc +0 -0
- utils.py +328 -2
test/__pycache__/__init__.cpython-38.pyc
DELETED
Binary file (151 Bytes)
test/__pycache__/test.cpython-38.pyc
DELETED
Binary file (1.33 kB)
utils.py
CHANGED
@@ -1,5 +1,4 @@
 import urllib
-from lxml import html
 import streamlit as st
 import requests
 import re
@@ -7,8 +6,188 @@ from stqdm import stqdm
 import os
 import shutil
 import time
+from bs4 import BeautifulSoup as bs
+from datetime import datetime
+from urllib.parse import quote
 
 
+class XRxivQuery:
+    def __init__(self, search_query, max_results, folder_name='docs', XRxiv_servers = [], search_by='all', sort_by='relevance'):
+        self.search_query = search_query
+        self.max_results = max_results
+        self.folder_name = folder_name
+        self.XRxiv_servers = XRxiv_servers
+        self.search_by = search_by
+        self.sort_by = sort_by
+        self.all_pdf_info = []
+        self.all_pdf_citation = []
+
+    def call_API(self):
+        search_query = self.search_query.strip().replace(" ", "+").split('+')#.replace(", ", "+").replace(",", "+")#.split('+')
+        if 'rxiv' in self.XRxiv_servers:
+            '''
+            Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+            <entry>\n
+            <id>http://arxiv.org/abs/2008.04584v2</id>\n
+            <updated>2021-05-11T12:00:24Z</updated>\n
+            <published>2020-08-11T08:47:06Z</published>\n
+            <title>Bayesian Selective Inference: Non-informative Priors</title>\n
+            <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
+            <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
+            <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
+            <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
+            <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
+            <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+            <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+            <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
+            </entry>\n
+            '''
+            print('Searching Arxiv\n')
+            # Call arXiv API
+            journal = 'arXiv'
+            # print(" ".join(search_query))
+            # print(self.search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+"))
+            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={self.max_results}'
+            # print(arXiv_url)
+            with urllib.request.urlopen(arXiv_url) as url:
+                s = url.read()
+
+            # Parse the xml data
+            from lxml import html
+            root = html.fromstring(s)
+            # Fetch relevant pdf information
+            pdf_entries = root.xpath("entry")
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, pdf in enumerate(pdf_entries):
+                pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
+                pdf_authors.append(pdf.xpath("author/name/text()"))
+                pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
+                pdf_categories.append(pdf.xpath("category/@term"))
+                folder_names.append(self.folder_name)
+                pdf_years.append(pdf.xpath('updated/text()')[0][:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+            # self.all_pdf_citation.append(pdf_citation)
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+        if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
+            '''
+            Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+            <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
+            <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
+            <span class="highwire-cite-title">
+            <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
+            <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
+            <div class="highwire-cite-authors"><span class="highwire-citation-authors">
+            <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
+            <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
+            <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
+            <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
+            <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
+            <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
+            <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
+            <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
+            <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
+            </div>
+            </div></li>
+            </entry>\n
+            '''
+            if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
+                print('Searching biorxiv\n')
+                journals_str = f'%20jcode%3Abiorxiv'
+            if 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
+                print('Searching medrxiv\n')
+                journals_str = f'%20jcode%3Amedrxiv'
+            if 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
+                print('Searching both biorxiv and medrxiv\n')
+                journals_str = f'%20jcode%3Abiorxiv%7C%7Cmedrxiv'
+
+            subject_str = ('%20').join(self.search_query[0].split())
+            for subject in search_query[1:]:
+                subject_str = subject_str + '%252B' + ('%20').join(subject.split())
+
+            current_dateTime = datetime.now()
+            today = str(current_dateTime)[:10]
+            start_day = '2013-01-01'
+            arXiv_url = f'https://www.biorxiv.org/search/'
+            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{self.max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+
+            url_response = requests.post(arXiv_url)
+            html = bs(url_response.text, features='html.parser')
+            pdf_entries = html.find_all(attrs={'class': 'search-result'})
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, pdf in enumerate(pdf_entries):
+                pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
+                pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
+
+                pdf_url = pdf.find('a', href=True)['href']
+                if pdf_url[:4] != 'http':
+                    pdf_url = f'http://www.biorxiv.org'+ pdf_url
+                pdf_urls.append(pdf_url)
+                pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
+                folder_names.append(self.folder_name)
+                pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+        self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
+        print(self.all_pdf_info)
+        return self.all_pdf_info
+
+    def download_pdf(self):
+        # if len(os.listdir(f'./{folder_name}') ) != 0:
+        # check folder is empty to avoid using papers from old runs:
+        # os.remove(f'./{folder_name}/*')
+        # print(pdf_info)
+        all_reference_text = []
+        for i,p in enumerate(stqdm(self.all_pdf_info, desc='Searching and downloading papers')):
+            pdf_title=p[0]
+            pdf_category=p[3]
+            pdf_url=p[1]
+            if pdf_category in ['medRxiv', 'bioRxiv']:
+                pdf_url += '.full.pdf'
+            pdf_file_name=p[0].replace(':','').replace('/','').replace('.','')
+            folder_name=p[4]
+            pdf_citation=p[5]
+            r = requests.get(pdf_url, allow_redirects=True)
+            if i == 0:
+                if not os.path.exists(f'{folder_name}'):
+                    os.makedirs(f"{folder_name}")
+                else:
+                    shutil.rmtree(f'{folder_name}')
+                    os.makedirs(f"{folder_name}")
+            with open(f'{folder_name}/{pdf_file_name}.pdf', 'wb') as f:
+                f.write(r.content)
+            if i == 0:
+                st.markdown("###### Papers found:")
+            st.markdown(f"{i+1}. {pdf_citation}")
+            time.sleep(0.15)
+            all_reference_text.append(f"{i+1}. {pdf_citation}\n")
+        if 'all_reference_text' not in st.session_state:
+            st.session_state.key = 'all_reference_text'
+        st.session_state['all_reference_text'] = ' '.join(all_reference_text)
+
+
+
+
+
+
+
+
+
 def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     '''
     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
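The new XRxivQuery class above bundles query, scrape, and download into one object. A minimal driver sketch (not part of the commit; the query string and server list are invented example inputs, and download_pdf assumes a running Streamlit app, since it calls st.markdown and st.session_state):

    # Hypothetical usage of the XRxivQuery class added in this commit.
    query = XRxivQuery('bayesian selective inference', max_results=5, XRxiv_servers=['rxiv'])
    pdf_info = query.call_API()   # flat list of (title, url, authors, categories, folder, citation) tuples
    query.download_pdf()          # saves the PDFs under 'docs/' and renders the citation list via st.markdown

Passing 'rxiv' in XRxiv_servers takes the arXiv branch of call_API; 'biorxiv' and/or 'medrxiv' take the bioRxiv/medRxiv branch.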
@@ -29,11 +208,13 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_resul
     '''
 
     # Remove space in seach query
-    search_query=search_query.strip().replace(" ", "+")
+    search_query=search_query.strip().replace(" ", "+").replace(", ","+").replace(",","+")
     # Call arXiv API
     arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
     with urllib.request.urlopen(arXiv_url) as url:
         s = url.read()
+
+    from lxml import html
 
     # Parse the xml data
     root = html.fromstring(s)
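Two changes land in call_arXiv_API in this hunk: the query normalization now folds commas into '+' separators as well, and the lxml import moves inside the function (it was removed from the module header in the first hunk). One quirk worth noting: because spaces are replaced before the comma patterns, ", " never matches, so a comma-separated query comes out with a doubled separator. An illustrative check (the query string is an invented example):

    q = 'large language models, protein folding'
    print(q.strip().replace(" ", "+").replace(", ", "+").replace(",", "+"))
    # -> 'large+language+models++protein+folding'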
@@ -101,3 +282,148 @@ def download_pdf(pdf_info):
     if 'all_reference_text' not in st.session_state:
         st.session_state.key = 'all_reference_text'
     st.session_state['all_reference_text'] = ' '.join(all_reference_text)
+
+
+
+def call_bioArXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
+    '''
+    Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+    <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
+    <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
+    <span class="highwire-cite-title">
+    <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
+    <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
+    <div class="highwire-cite-authors"><span class="highwire-citation-authors">
+    <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
+    <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
+    <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
+    <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
+    <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
+    <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
+    <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
+    <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
+    <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
+    </div>
+    </div></li>
+    </entry>\n
+    '''
+
+    # Remove space in seach query
+    search_query=search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+").split('+')
+    subject_str = ('%20').join(search_query[0].split())
+    for subject in search_query[1:]:
+        subject_str = subject_str + '%252B' + ('%20').join(subject.split())
+
+    # print(subject_str)
+    # Call arXiv API
+    # bio_arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
+    # "https://api.biorxiv.org"
+    current_dateTime = datetime.now()
+    today = str(current_dateTime)[:10]
+    journal = 'biorxiv'
+    # journals_str = '%20jcode%3Amedrxiv%7C%7Cbiorxiv'
+
+    bio_arXiv_url = f'https://www.biorxiv.org/search/'
+    # kwd_str = 'abstract_title%3A' + ('%252C%2B').join([search_query[0]] + [('%2B').join(keyword.split()) for keyword in search_query[1:]])
+    # print(kwd_str)
+    # kwd_str = kwd_str + '%20abstract_title_flags%3Amatch-' + 'all'
+    # bio_arXiv_url += '%20' + kwd_str
+
+    launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
+
+    both = False
+    bio_only = True
+    med_only = False
+    if bio_only:
+        print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Abiorxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard\n bio_only')
+        journal = 'biorxiv'
+        journals_str = f'%20jcode%3A{journal}'
+    if both:
+        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A2022-11-06%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n both')
+        journal = 'biorxiv'
+        journals_str = f'%20jcode%3A{journal}%7C%7Cmedrxiv'
+    if med_only:
+        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n med_only')
+        journal = 'medrxiv'
+        journals_str = f'%20jcode%3A{journal}'
+    start_day = launch_dates[journal]
+    bio_arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+
+    # print(bio_arXiv_url)
+    url_response = requests.post(bio_arXiv_url)
+    html = bs(url_response.text, features='html.parser')
+    pdf_entries = html.find_all(attrs={'class': 'search-result'})
+    # print(articles)
+
+    # with urllib.request.urlopen(bio_arXiv_url) as url:
+    # s = url.read()
+    # # Parse the xml data
+    # root = html.fromstring(s)
+    # # Fetch relevant pdf information
+    # pdf_entries = root.xpath("entry")
+    # print(pdf_entries)
+    pdf_titles = []
+    pdf_authors = []
+    pdf_urls = []
+    pdf_categories = []
+    folder_names = []
+    pdf_citation = []
+    pdf_years = []
+
+    for i, pdf in enumerate(pdf_entries):
+        # print(pdf.xpath('updated/text()')[0][:4])
+        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
+        # print(pdf)
+        # [article.find('span', attrs={'class': 'highwire-cite-title'}).text.strip() if article.find('span', attrs={'class': 'highwire-cite-title'}) is not None else None for article in articles]
+        pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
+        # print(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip())
+        pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
+        # print(pdf_authors)
+
+        # print(f'http://www.{journal}.org')
+        pdf_url = pdf.find('a', href=True)['href']
+        if pdf_url[:4] != 'http':
+            pdf_url = f'http://www.biorxiv.org'+ pdf_url
+        pdf_urls.append(pdf_url)
+        pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
+        # print(pdf_categories)
+        folder_names.append(folder_name)
+        pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
+
+        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+        # print(pdf_citation)
+
+        # break
+
+
+
+    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+
+    # Check number of available files
+    print('Requesting {max_results} files'.format(max_results=max_results))
+    if len(pdf_urls)<int(max_results):
+        matching_pdf_num=len(pdf_urls)
+        print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
+    return pdf_info
+
+import urllib.request as urllib2
+
+def download_bio_pdf(pdf_info):
+    for p in tqdm(pdf_info):
+        pdf_title=p[0].replace(':','').replace('/','-').replace('.','')
+        pdf_url=p[1] + '.full.pdf'
+        # print(pdf_url)
+        pdf_author=p[2]
+        pdf_category=p[3]
+        print(pdf_category)
+        folder_name=p[4]
+        pdf_citation=p[5]
+        r = requests.get(pdf_url, allow_redirects=True)
+        # print(r)
+        print(pdf_url)
+        # r = requests.get(pdf_url, stream=True)
+        if not os.path.exists(folder_name):
+            os.makedirs(f"{folder_name}")
+        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
+            f.write(r.content)
+
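A note on the URL building in call_bioArXiv_API (and the matching branch of XRxivQuery.call_API): the commit imports urllib.parse.quote but assembles the bioRxiv search URL from hard-coded escapes (%20 for spaces, %3A for ':', %7C%7C for '||', and the double-encoded %252B between subjects). The f-string '%20limit_from%3A2{start_day}' also appears to carry a stray literal '2', which would produce limit_from:22013-01-01 rather than limit_from:2013-01-01. A sketch of the same encoding built with quote; the subject and dates are example values, and this is an equivalent construction rather than the commit's own code path:

    from urllib.parse import quote

    # '+'-joined subject, percent-encoded twice: 'serverless%252Bcomputing'
    subject = quote(quote('serverless computing'.replace(' ', '+')))
    # quote() encodes ' ' as %20 and ':' as %3A, matching the hand-built filter string
    filters = quote(' jcode:biorxiv limit_from:2013-01-01 limit_to:2023-02-17 numresults:10 sort:relevance-rank format_result:standard')
    url = 'https://www.biorxiv.org/search/' + subject + filters
    print(url)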