Commit 277d92a
Parent(s): 6cbbc77

added unittest, cleaned up

Files changed:
- app.py  +1 -2
- test/test.py  +9 -0
- utils.py  +20 -256
app.py CHANGED
@@ -43,7 +43,7 @@ def search_click_callback(search_query, max_results, XRxiv_servers=[]):
     return pdf_info
 
 with st.form(key='columns_in_form', clear_on_submit = False):
-    c1, c2, c3 = st.columns([
+    c1, c2, c3 = st.columns([5, 0.8, 4])
     with c1:
         search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value='CFD Modeling'
         )#search_query, max_results_current))
@@ -117,7 +117,6 @@ if submitButton:
     st.write(f"{st.session_state['all_reference_text']}")
     with st.spinner('⏳ Please wait...'):
         start = time.time()
-        print(word_count)
         final_answer = answer_callback(question_query, word_count)
         length_answer = len(final_answer)
         st.text_area("Answer:", final_answer, height=max(length_answer//4, 100))
test/test.py CHANGED
@@ -27,5 +27,14 @@ class Utils(unittest.TestCase):
         self.assertTrue(os.path.exists(dowloaded_dir))
         shutil.rmtree(f'docs/')
 
+    def test_distibute_max_papers(self):
+        XRxiv_servers = ['rxiv', 'medrxiv']
+        max_results = 10
+        max_papers_in_server = distibute_max_papers(max_results, XRxiv_servers)
+        self.assertEqual(max_results, np.sum(max_papers_in_server))
+        self.assertEqual(max_papers_in_server[2], 0)
+        self.assertGreater(max_papers_in_server[0],0)
+        self.assertGreater(max_papers_in_server[3],0)
+
 if __name__ == '__main__':
     unittest.main()
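Note: the new test case exercises the distibute_max_papers helper added to utils.py below. A minimal sketch of the same checks outside unittest, assuming utils.py and its dependencies are importable from the repo root (the counts name is only for illustration), could look like:

    import numpy as np
    from utils import distibute_max_papers

    # Ask for 10 papers spread over two servers; indices follow
    # ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv'] as defined in utils.py.
    counts = distibute_max_papers(10, ['rxiv', 'medrxiv'])
    assert int(np.sum(counts)) == 10        # the full budget is distributed
    assert counts[2] == 0                   # biorxiv was not requested
    assert counts[0] > 0 and counts[3] > 0  # each requested server gets at least one paper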
utils.py CHANGED
@@ -8,7 +8,8 @@ import shutil
 import time
 from bs4 import BeautifulSoup as bs
 from datetime import datetime
-from
+from random import uniform as rand
+import numpy as np
 
 
 class XRxivQuery:
@@ -24,6 +25,7 @@ class XRxivQuery:
 
     def call_API(self):
         search_query = self.search_query.strip().replace(" ", "+").split('+')#.replace(", ", "+").replace(",", "+")#.split('+')
+        max_papers_in_server = distibute_max_papers(self.max_results, self.XRxiv_servers)
         if 'rxiv' in self.XRxiv_servers:
             '''
             Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
@@ -42,13 +44,10 @@
             <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
             </entry>\n
             '''
-            print('Searching Arxiv\n')
             # Call arXiv API
             journal = 'arXiv'
-
-
-            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={self.max_results}'
-            # print(arXiv_url)
+            max_rxiv_papers = max_papers_in_server[0]
+            arXiv_url=f'http://export.arxiv.org/api/query?search_query={self.search_by}:{"+".join(search_query)}&sortBy={self.sort_by}&start=0&max_results={max_rxiv_papers}'
             with urllib.request.urlopen(arXiv_url) as url:
                 s = url.read()
 
@@ -72,7 +71,6 @@
                 folder_names.append(self.folder_name)
                 pdf_years.append(pdf.xpath('updated/text()')[0][:4])
                 pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-                # self.all_pdf_citation.append(pdf_citation)
             pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
             self.all_pdf_info.append(pdf_info)
 
@@ -99,13 +97,16 @@
             </entry>\n
             '''
             if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
-                print('Searching biorxiv\n')
+                # print('Searching biorxiv\n')
+                max_biorxiv_papers = max_papers_in_server[2]
                 journals_str = f'%20jcode%3Abiorxiv'
             if 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
-                print('Searching medrxiv\n')
+                # print('Searching medrxiv\n')
+                max_biorxiv_papers = max_papers_in_server[3]
                 journals_str = f'%20jcode%3Amedrxiv'
             if 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
-                print('Searching both biorxiv and medrxiv\n')
+                # print('Searching both biorxiv and medrxiv\n')
+                max_biorxiv_papers = max_papers_in_server[3] + max_papers_in_server[2]  # biorxiv and medrxiv are searched together.
                 journals_str = f'%20jcode%3Abiorxiv%7C%7Cmedrxiv'
 
             subject_str = ('%20').join(self.search_query[0].split())
@@ -116,7 +117,7 @@
             today = str(current_dateTime)[:10]
             start_day = '2013-01-01'
             arXiv_url = f'https://www.biorxiv.org/search/'
-            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{
+            arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_biorxiv_papers}%20sort%3Arelevance-rank%20format_result%3Astandard'
 
             url_response = requests.post(arXiv_url)
             html = bs(url_response.text, features='html.parser')
@@ -145,7 +146,6 @@
             self.all_pdf_info.append(pdf_info)
 
         self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
-        print(self.all_pdf_info)
         return self.all_pdf_info
 
     def download_pdf(self):
@@ -183,247 +183,11 @@
 
 
 
-
-
-
-
-
-
-
-
-    <entry>\n
-    <id>http://arxiv.org/abs/2008.04584v2</id>\n
-    <updated>2021-05-11T12:00:24Z</updated>\n
-    <published>2020-08-11T08:47:06Z</published>\n
-    <title>Bayesian Selective Inference: Non-informative Priors</title>\n
-    <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
-    <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
-    <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
-    <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
-    <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
-    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-    <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-    <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
-    </entry>\n
-    '''
-
-    # Remove space in seach query
-    search_query=search_query.strip().replace(" ", "+").replace(", ","+").replace(",","+")
-    # Call arXiv API
-    arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-    with urllib.request.urlopen(arXiv_url) as url:
-        s = url.read()
-
-    from lxml import html
-
-    # Parse the xml data
-    root = html.fromstring(s)
-    # Fetch relevant pdf information
-    pdf_entries = root.xpath("entry")
-
-    pdf_titles = []
-    pdf_authors = []
-    pdf_urls = []
-    pdf_categories = []
-    folder_names = []
-    pdf_citation = []
-    pdf_years = []
-
-    for i, pdf in enumerate(pdf_entries):
-        pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
-        pdf_authors.append(pdf.xpath("author/name/text()"))
-        pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
-        pdf_categories.append(pdf.xpath("category/@term"))
-        folder_names.append(folder_name)
-        pdf_years.append(pdf.xpath('updated/text()')[0][:4])
-        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-
-
-
-    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-    # Check number of available files
-    if len(pdf_urls)<int(max_results):
-        matching_pdf_num=len(pdf_urls)
-        # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-    return pdf_info, pdf_citation
-
-
-def download_pdf(pdf_info):
-
-    # if len(os.listdir(f'./{folder_name}') ) != 0:
-    # check folder is empty to avoid using papers from old runs:
-    # os.remove(f'./{folder_name}/*')
-    # print(pdf_info)
-    all_reference_text = []
-    for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
-        pdf_title=p[0].replace(':','').replace('/','').replace('.','')
-        pdf_category=p[3]
-        pdf_url=p[1]
-        if pdf_category in ['medRxiv', 'bioRxiv']:
-            pdf_url += '.full.pdf'
-        pdf_author=p[2]
-        folder_name=p[4]
-        pdf_citation=p[5]
-        r = requests.get(pdf_url, allow_redirects=True)
-        if i == 0:
-            if not os.path.exists(f'{folder_name}'):
-                os.makedirs(f"{folder_name}")
-            else:
-                shutil.rmtree(f'{folder_name}')
-                os.makedirs(f"{folder_name}")
-        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
-            f.write(r.content)
-        if i == 0:
-            st.markdown("###### Papers found:")
-        st.markdown(f"{i+1}. {pdf_citation}")
-        time.sleep(0.15)
-        all_reference_text.append(f"{i+1}. {pdf_citation}\n")
-    if 'all_reference_text' not in st.session_state:
-        st.session_state.key = 'all_reference_text'
-    st.session_state['all_reference_text'] = ' '.join(all_reference_text)
-
-
-
-def call_bioArXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='docs'):
-    '''
-    Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
-    <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
-    <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
-    <span class="highwire-cite-title">
-    <a class="highwire-cite-linked-title" data-hide-link-title="0" data-icon-position="" href="http://medrxiv.org/content/early/2021/02/18/2021.02.12.21251663">
-    <span class="highwire-cite-title">ClinGen Variant Curation Interface: A Variant Classification Platform for the Application of Evidence Criteria from ACMG/AMP Guidelines</span></a> </span>
-    <div class="highwire-cite-authors"><span class="highwire-citation-authors">
-    <span class="highwire-citation-author first" data-delta="0"><span class="nlm-given-names">Christine G.</span> <span class="nlm-surname">Preston</span></span>,
-    <span class="highwire-citation-author" data-delta="1"><span class="nlm-given-names">Matt W.</span> <span class="nlm-surname">Wright</span></span>,
-    <span class="highwire-citation-author" data-delta="2"><span class="nlm-given-names">Rao</span> <span class="nlm-surname">Madhavrao</span></span>,
-    <div class="highwire-cite-metadata"><span class="highwire-cite-metadata-journal highwire-cite-metadata">medRxiv </span>
-    <span class="highwire-cite-metadata-pages highwire-cite-metadata">2021.02.12.21251663; </span><span class="highwire-cite-metadata-doi highwire-cite-metadata">
-    <span class="doi_label">doi:</span> https://doi.org/10.1101/2021.02.12.21251663 </span></div>
-    <div class="highwire-cite-extras"><div class="hw-make-citation" data-encoded-apath=";medrxiv;early;2021;02;18;2021.02.12.21251663.atom" data-seqnum="0" id="hw-make-citation-0">
-    <a class="link-save-citation-save use-ajax hw-link-save-unsave-catation link-icon" href="/highwire-save-citation/saveapath/%3Bmedrxiv%3Bearly%3B2021%3B02%3B18%3B2021.02.12.21251663.atom/nojs/0" id="link-save-citation-toggle-0" title="Save">
-    <span class="icon-plus"></span> <span class="title">Add to Selected Citations</span></a></div></div>
-    </div>
-    </div></li>
-    </entry>\n
-    '''
-
-    # Remove space in seach query
-    search_query=search_query.strip().replace(", ", "+").replace(" ", "+").replace(",", "+").split('+')
-    subject_str = ('%20').join(search_query[0].split())
-    for subject in search_query[1:]:
-        subject_str = subject_str + '%252B' + ('%20').join(subject.split())
-
-    # print(subject_str)
-    # Call arXiv API
-    # bio_arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-    # "https://api.biorxiv.org"
-    current_dateTime = datetime.now()
-    today = str(current_dateTime)[:10]
-    journal = 'biorxiv'
-    # journals_str = '%20jcode%3Amedrxiv%7C%7Cbiorxiv'
-
-    bio_arXiv_url = f'https://www.biorxiv.org/search/'
-    # kwd_str = 'abstract_title%3A' + ('%252C%2B').join([search_query[0]] + [('%2B').join(keyword.split()) for keyword in search_query[1:]])
-    # print(kwd_str)
-    # kwd_str = kwd_str + '%20abstract_title_flags%3Amatch-' + 'all'
-    # bio_arXiv_url += '%20' + kwd_str
-
-    launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
-
-    both = False
-    bio_only = True
-    med_only = False
-    if bio_only:
-        print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Abiorxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard\n bio_only')
-        journal = 'biorxiv'
-        journals_str = f'%20jcode%3A{journal}'
-    if both:
-        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A2022-11-06%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n both')
-        journal = 'biorxiv'
-        journals_str = f'%20jcode%3A{journal}%7C%7Cmedrxiv'
-    if med_only:
-        # print('https://www.biorxiv.org/search/serverless%252Bcomputing%252Bbioinformatics%20jcode%3Amedrxiv%20limit_from%3A2021-06-13%20limit_to%3A2023-02-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard\n med_only')
-        journal = 'medrxiv'
-        journals_str = f'%20jcode%3A{journal}'
-    start_day = launch_dates[journal]
-    bio_arXiv_url += subject_str + journals_str + f'%20limit_from%3A2{start_day}%20limit_to%3A{today}%20numresults%3A{max_results}%20sort%3Arelevance-rank%20format_result%3Astandard'
-
-    # print(bio_arXiv_url)
-    url_response = requests.post(bio_arXiv_url)
-    html = bs(url_response.text, features='html.parser')
-    pdf_entries = html.find_all(attrs={'class': 'search-result'})
-    # print(articles)
-
-    # with urllib.request.urlopen(bio_arXiv_url) as url:
-    # s = url.read()
-    # # Parse the xml data
-    # root = html.fromstring(s)
-    # # Fetch relevant pdf information
-    # pdf_entries = root.xpath("entry")
-    # print(pdf_entries)
-    pdf_titles = []
-    pdf_authors = []
-    pdf_urls = []
-    pdf_categories = []
-    folder_names = []
-    pdf_citation = []
-    pdf_years = []
-
-    for i, pdf in enumerate(pdf_entries):
-        # print(pdf.xpath('updated/text()')[0][:4])
-        # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
-        # print(pdf)
-        # [article.find('span', attrs={'class': 'highwire-cite-title'}).text.strip() if article.find('span', attrs={'class': 'highwire-cite-title'}) is not None else None for article in articles]
-        pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
-        # print(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip())
-        pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
-        # print(pdf_authors)
-
-        # print(f'http://www.{journal}.org')
-        pdf_url = pdf.find('a', href=True)['href']
-        if pdf_url[:4] != 'http':
-            pdf_url = f'http://www.biorxiv.org'+ pdf_url
-        pdf_urls.append(pdf_url)
-        pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
-        # print(pdf_categories)
-        folder_names.append(folder_name)
-        pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
-
-        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-        # print(pdf_citation)
-
-        # break
-
-
-
-    pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-    # Check number of available files
-    print('Requesting {max_results} files'.format(max_results=max_results))
-    if len(pdf_urls)<int(max_results):
-        matching_pdf_num=len(pdf_urls)
-        print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-    return pdf_info
-
-import urllib.request as urllib2
-
-def download_bio_pdf(pdf_info):
-    for p in tqdm(pdf_info):
-        pdf_title=p[0].replace(':','').replace('/','-').replace('.','')
-        pdf_url=p[1] + '.full.pdf'
-        # print(pdf_url)
-        pdf_author=p[2]
-        pdf_category=p[3]
-        print(pdf_category)
-        folder_name=p[4]
-        pdf_citation=p[5]
-        r = requests.get(pdf_url, allow_redirects=True)
-        # print(r)
-        print(pdf_url)
-        # r = requests.get(pdf_url, stream=True)
-        if not os.path.exists(folder_name):
-            os.makedirs(f"{folder_name}")
-        with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as f:
-            f.write(r.content)
-
+def distibute_max_papers(max_results, XRxiv_servers):
+    fixed_length = len(XRxiv_servers)
+    sample = np.random.multinomial(max_results - fixed_length, np.ones(fixed_length)/fixed_length, size=1)[0] + 1
+    max_papers_in_server = np.zeros(4, dtype=int)
+    all_servers = ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv']
+    for i,s in enumerate(XRxiv_servers):
+        max_papers_in_server[all_servers.index(s)] = int(sample[i])
+    return max_papers_in_server
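For context on the new helper: distibute_max_papers reserves one paper for every requested server and splits the remainder of max_results with np.random.multinomial over a uniform probability vector, so the per-server counts vary between runs but always sum to max_results. A standalone numpy sketch of that step (variable names here are illustrative, not part of the commit):

    import numpy as np

    max_results, n_servers = 10, 2
    # Split (max_results - n_servers) uniformly at random, then add the
    # guaranteed one paper per server back on top.
    sample = np.random.multinomial(max_results - n_servers, np.ones(n_servers) / n_servers, size=1)[0] + 1
    print(sample, sample.sum())  # e.g. [6 4] 10 -- always sums to max_results, each entry >= 1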