Commit e5ddac0
Parent(s): b06418a
added arXiv API class
Files changed:
- test/__pycache__/__init__.cpython-38.pyc +0 -0
- test/__pycache__/test.cpython-38.pyc +0 -0
- utils.py +328 -2
test/__pycache__/__init__.cpython-38.pyc
DELETED
Binary file (151 Bytes)
test/__pycache__/test.cpython-38.pyc
DELETED
Binary file (1.33 kB)
utils.py
CHANGED
@@ -1,5 +1,4 @@
 import urllib
-from lxml import html
 import streamlit as st
 import requests
 import re
@@ -7,8 +6,188 @@ from stqdm import stqdm
 import os
 import shutil
 import time
+from bs4 import BeautifulSoup as bs
+from datetime import datetime
+from urllib.parse import quote
 
 
+class XRxivQuery:
+    def __init__(self, search_query, max_results, folder_name='docs', XRxiv_servers = [], search_by='all', sort_by='relevance'):
+        self.search_query = search_query
+        self.max_results = max_results
+        self.folder_name = folder_name
+        self.XRxiv_servers = XRxiv_servers
+        self.search_by = search_by
+        self.sort_by = sort_by
+        self.all_pdf_info = []
+        self.all_pdf_citation = []
+
+    def call_API(self):
+        search_query = self.search_query.strip().replace(" ", "+").split('+')#.replace(", ", "+").replace(",", "+")#.split('+')
+        if 'rxiv' in self.XRxiv_servers:
+            '''
+            Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+            <entry>\n
+            <id>http://arxiv.org/abs/2008.04584v2</id>\n
+            <updated>2021-05-11T12:00:24Z</updated>\n
+            <published>2020-08-11T08:47:06Z</published>\n
+            <title>Bayesian Selective Inference: Non-informative Priors</title>\n
+            <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
+            <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
+            <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
+            <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
+            <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
+            <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+            <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+            <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
+            </entry>\n
+            '''
+            print('Searching Arxiv\n')
+            # Call arXiv API
+            journal = 'arXiv'
+            # print(" ".join(search_query))
+            # print(self.search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+"))
+            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={self.max_results}'
+            # print(arXiv_url)
+            with urllib.request.urlopen(arXiv_url) as url:
+                s = url.read()
+
+            # Parse the xml data
+            from lxml import html
+            root = html.fromstring(s)
+            # Fetch relevant pdf information
+            pdf_entries = root.xpath("entry")
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, pdf in enumerate(pdf_entries):
+                pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
+                pdf_authors.append(pdf.xpath("author/name/text()"))
+                pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
+                pdf_categories.append(pdf.xpath("category/@term"))
+                folder_names.append(self.folder_name)
+                pdf_years.append(pdf.xpath('updated/text()')[0][:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+            # self.all_pdf_citation.append(pdf_citation)
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+        if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
+            '''
+            Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+            <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
+            <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
+            <span class="highwire-cite-title">
+            <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
+            <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
+            <div class="highwire-cite-authors"><span class="highwire-citation-authors">
+            <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
+            <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
+            <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
+            <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
+            <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
+            <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
+            <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
+            <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
+            <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
+            </div>
+            </div></li>
+            </entry>\n
+            '''
+            if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
+                print('Searching biorxiv\n')
+                journals_str = f'%20jcode%3Abiorxiv'
+            if 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
+                print('Searching medrxiv\n')
+                journals_str = f'%20jcode%3Amedrxiv'
+            if 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
+                print('Searching both biorxiv and medrxiv\n')
+                journals_str = f'%20jcode%3Abiorxiv%7C%7Cmedrxiv'
+
+            subject_str = ('%20').join(self.search_query[0].split())
+            for subject in search_query[1:]:
+                subject_str = subject_str + '%252B' + ('%20').join(subject.split())
+
+            current_dateTime = datetime.now()
+            today = str(current_dateTime)[:10]
+            start_day = '2013-01-01'
+            arXiv_url = f'https://www.biorxiv.org/search/'
+            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{self.max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+
+            url_response = requests.post(arXiv_url)
+            html = bs(url_response.text, features='html.parser')
+            pdf_entries = html.find_all(attrs={'class': 'search-result'})
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, pdf in enumerate(pdf_entries):
+                pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
+                pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
+
+                pdf_url = pdf.find('a', href=True)['href']
+                if pdf_url[:4] != 'http':
+                    pdf_url = f'http://www.biorxiv.org'+ pdf_url
+                pdf_urls.append(pdf_url)
+                pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
+                folder_names.append(self.folder_name)
+                pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+        self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
+        print(self.all_pdf_info)
+        return self.all_pdf_info
+
+    def download_pdf(self):
+        # if len(os.listdir(f'./{folder_name}') ) != 0:
+        # check folder is empty to avoid using papers from old runs:
+        # os.remove(f'./{folder_name}/*')
+        # print(pdf_info)
+        all_reference_text = []
+        for i,p in enumerate(stqdm(self.all_pdf_info, desc='Searching and downloading papers')):
+            pdf_title=p[0]
+            pdf_category=p[3]
+            pdf_url=p[1]
+            if pdf_category in ['medRxiv', 'bioRxiv']:
+                pdf_url += '.full.pdf'
+            pdf_file_name=p[0].replace(':','').replace('/','').replace('.','')
+            folder_name=p[4]
+            pdf_citation=p[5]
+            r = requests.get(pdf_url, allow_redirects=True)
+            if i == 0:
+                if not os.path.exists(f'{folder_name}'):
+                    os.makedirs(f"{folder_name}")
+                else:
+                    shutil.rmtree(f'{folder_name}')
+                    os.makedirs(f"{folder_name}")
+            with open(f'{folder_name}/{pdf_file_name}.pdf', 'wb') as f:
+                f.write(r.content)
+            if i == 0:
+                st.markdown("###### Papers found:")
+            st.markdown(f"{i+1}. {pdf_citation}")
+            time.sleep(0.15)
+            all_reference_text.append(f"{i+1}. {pdf_citation}\n")
+        if 'all_reference_text' not in st.session_state:
+            st.session_state.key = 'all_reference_text'
+        st.session_state['all_reference_text'] = ' '.join(all_reference_text)
+
+
+
+
+
+
+
+
+
 def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
     '''
     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
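The new XRxivQuery class above bundles query, scrape, and download into one object. A minimal driver sketch (not part of the commit; the query string and server list are invented example inputs, and download_pdf assumes a running Streamlit app, since it calls st.markdown and st.session_state):

    # Hypothetical usage of the XRxivQuery class added in this commit.
    query = XRxivQuery('bayesian selective inference', max_results=5, XRxiv_servers=['rxiv'])
    pdf_info = query.call_API()   # flat list of (title, url, authors, categories, folder, citation) tuples
    query.download_pdf()          # saves the PDFs under 'docs/' and renders the citation list via st.markdown

Passing 'rxiv' in XRxiv_servers takes the arXiv branch of call_API; 'biorxiv' and/or 'medrxiv' take the bioRxiv/medRxiv branch.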
@@ -29,11 +208,13 @@ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_resul
     '''
 
     # Remove space in seach query
-    search_query=search_query.strip().replace(" ", "+")
+    search_query=search_query.strip().replace(" ", "+").replace(", ","+").replace(",","+")
     # Call arXiv API
     arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
     with urllib.request.urlopen(arXiv_url) as url:
         s = url.read()
+
+    from lxml import html
 
     # Parse the xml data
     root = html.fromstring(s)
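Two changes land in call_arXiv_API in this hunk: the query normalization now folds commas into '+' separators as well, and the lxml import moves inside the function (it was removed from the module header in the first hunk). One quirk worth noting: because spaces are replaced before the comma patterns, ", " never matches, so a comma-separated query comes out with a doubled separator. An illustrative check (the query string is an invented example):

    q = 'large language models, protein folding'
    print(q.strip().replace(" ", "+").replace(", ", "+").replace(",", "+"))
    # -> 'large+language+models++protein+folding'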
@@ -101,3 +282,148 @@ def download_pdf(pdf_info):
     if 'all_reference_text' not in st.session_state:
         st.session_state.key = 'all_reference_text'
     st.session_state['all_reference_text'] = ' '.join(all_reference_text)
+
+
+
+def call_bioArXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
+    '''
+    Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+    <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
+    <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
+    <span class="highwire-cite-title">
+    <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
+    <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
+    <div class="highwire-cite-authors"><span class="highwire-citation-authors">
+    <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
+    <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
+    <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
+    <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
+    <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
+    <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
+    <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
+    <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
+    <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
+    </div>
+    </div></li>
+    </entry>\n
+    '''
+
+    # Remove space in seach query
+    search_query=search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+").split('+')
+    subject_str = ('%20').join(search_query[0].split())
+    for subject in search_query[1:]:
+        subject_str = subject_str + '%252B' + ('%20').join(subject.split())
+
+    # print(subject_str)
+    # Call arXiv API
+    # bio_arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
+    # "https://api.biorxiv.org"
+    current_dateTime = datetime.now()
+    today = str(current_dateTime)[:10]
+    journal = 'biorxiv'
+    # journals_str = '%20jcode%3Amedrxiv%7C%7Cbiorxiv'
+
+    bio_arXiv_url = f'https://www.biorxiv.org/search/'
+    # kwd_str = 'abstract_title%3A' + ('%252C%2B').join([search_query[0]] + [('%2B').join(keyword.split()) for keyword in search_query[1:]])
+    # print(kwd_str)
+    # kwd_str = kwd_str + '%20abstract_title_flags%3Amatch-' + 'all'
+    # bio_arXiv_url += '%20' + kwd_str
+
+    launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
+
+    both = False
+    bio_only = True
+    med_only = False
+    if bio_only:
+        print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Abiorxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard\n bio_only')
+        journal = 'biorxiv'
+        journals_str = f'%20jcode%3A{journal}'
+    if both:
+        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A2022-11-06%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n both')
+        journal = 'biorxiv'
+        journals_str = f'%20jcode%3A{journal}%7C%7Cmedrxiv'
+    if med_only:
+        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n med_only')
+        journal = 'medrxiv'
+        journals_str = f'%20jcode%3A{journal}'
+    start_day = launch_dates[journal]
+    bio_arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
+
+    # print(bio_arXiv_url)
+    url_response = requests.post(bio_arXiv_url)
+    html = bs(url_response.text, features='html.parser')
+    pdf_entries = html.find_all(attrs={'class': 'search-result'})
+    # print(articles)
+
+    # with urllib.request.urlopen(bio_arXiv_url) as url:
+    # s = url.read()
+    # # Parse the xml data
+    # root = html.fromstring(s)
+    # # Fetch relevant pdf information
+    # pdf_entries = root.xpath("entry")
+    # print(pdf_entries)
+    pdf_titles = []
+    pdf_authors = []
+    pdf_urls = []
+    pdf_categories = []
+    folder_names = []
+    pdf_citation = []
+    pdf_years = []
+
+    for i, pdf in enumerate(pdf_entries):
+        # print(pdf.xpath('updated/text()')[0][:4])
+        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
+        # print(pdf)
+        # [article.find('span', attrs={'class': 'highwire-cite-title'}).text.strip() if article.find('span', attrs={'class': 'highwire-cite-title'}) is not None else None for article in articles]
+        pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
+        # print(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip())
+        pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
+        # print(pdf_authors)
+
+        # print(f'http://www.{journal}.org')
+        pdf_url = pdf.find('a', href=True)['href']
+        if pdf_url[:4] != 'http':
+            pdf_url = f'http://www.biorxiv.org'+ pdf_url
+        pdf_urls.append(pdf_url)
+        pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
+        # print(pdf_categories)
+        folder_names.append(folder_name)
+        pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
+
+        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+        # print(pdf_citation)
+
+        # break
+
+
+
+    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+
+    # Check number of available files
+    print('Requesting {max_results} files'.format(max_results=max_results))
+    if len(pdf_urls)<int(max_results):
+        matching_pdf_num=len(pdf_urls)
+        print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
+    return pdf_info
+
+import urllib.request as urllib2
+
+def download_bio_pdf(pdf_info):
+    for p in tqdm(pdf_info):
+        pdf_title=p[0].replace(':','').replace('/','-').replace('.','')
+        pdf_url=p[1] + '.full.pdf'
+        # print(pdf_url)
+        pdf_author=p[2]
+        pdf_category=p[3]
+        print(pdf_category)
+        folder_name=p[4]
+        pdf_citation=p[5]
+        r = requests.get(pdf_url, allow_redirects=True)
+        # print(r)
+        print(pdf_url)
+        # r = requests.get(pdf_url, stream=True)
+        if not os.path.exists(folder_name):
+            os.makedirs(f"{folder_name}")
+        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
+            f.write(r.content)
+
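A note on the URL building in call_bioArXiv_API (and the matching branch of XRxivQuery.call_API): the commit imports urllib.parse.quote but assembles the bioRxiv search URL from hard-coded escapes (%20 for spaces, %3A for ':', %7C%7C for '||', and the double-encoded %252B between subjects). The f-string '%20limit_from%3A2{start_day}' also appears to carry a stray literal '2', which would produce limit_from:22013-01-01 rather than limit_from:2013-01-01. A sketch of the same encoding built with quote; the subject and dates are example values, and this is an equivalent construction rather than the commit's own code path:

    from urllib.parse import quote

    # '+'-joined subject, percent-encoded twice: 'serverless%252Bcomputing'
    subject = quote(quote('serverless computing'.replace(' ', '+')))
    # quote() encodes ' ' as %20 and ':' as %3A, matching the hand-built filter string
    filters = quote(' jcode:biorxiv limit_from:2013-01-01 limit_to:2023-02-17 numresults:10 sort:relevance-rank format_result:standard')
    url = 'https://www.biorxiv.org/search/' + subject + filters
    print(url)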