import os
import re
import shutil
import time
import urllib.request

import requests
import streamlit as st
from lxml import html
from stqdm import stqdm


def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='arxiv-dl'):
    '''
    Queries the arXiv API and parses the returned Atom feed to get data from
    each entry in a search. Entries have the following formatting:

        http://arxiv.org/abs/2008.04584v2
        2021-05-11T12:00:24Z
        2020-08-11T08:47:06Z
        Bayesian Selective Inference: Non-informative Priors
        We discuss Bayesian inference for parameters selected using the data.
        First, we provide a critical analysis of the existing positions in the
        literature regarding the correct Bayesian approach under selection.
        Second, we propose two types of non-informative priors for selection
        models. These priors may be employed to produce a posterior
        distribution in the absence of prior information as well as to provide
        well-calibrated frequentist inference for the selected parameter. We
        test the proposed priors empirically in several scenarios.
        Daniel G. Rasines
        G. Alastair Young
        24 pages, 7 figures
    '''
    # Replace spaces in the search query so it can be embedded in the URL
    search_query = search_query.strip().replace(" ", "+")

    # Call the arXiv API
    arXiv_url = (f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}'
                 f'&sortBy={sort_by}&start=0&max_results={max_results}')
    with urllib.request.urlopen(arXiv_url) as url:
        s = url.read()

    # Parse the XML data
    root = html.fromstring(s)

    # Fetch relevant PDF information
    pdf_entries = root.xpath("entry")
    pdf_titles = []
    pdf_authors = []
    pdf_urls = []
    pdf_categories = []
    folder_names = []
    pdf_citation = []
    pdf_years = []

    for i, pdf in enumerate(pdf_entries):
        # xpath returns a list with every occurrence of the path. Since we're
        # handling each entry individually, we take the first element to avoid
        # an unnecessary list.
        pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
        pdf_authors.append(pdf.xpath("author/name/text()"))
        pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
        pdf_categories.append(pdf.xpath("category/@term"))
        folder_names.append(folder_name)
        pdf_years.append(pdf.xpath('updated/text()')[0][:4])
        pdf_citation.append(
            f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] "
            f"({pdf_years[i]}), (available at {pdf_urls[i]})."
        )

    pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))

    # Check the number of available files against the number requested
    # print('Requesting {max_results} files'.format(max_results=max_results))
    if len(pdf_urls) < int(max_results):
        print(f'Only {len(pdf_urls)} files matching the search query were found.')

    return pdf_info, pdf_citation
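
# Minimal usage sketch (an assumption, not part of the original app): the query,
# field, and result count below are illustrative values only, and the two-value
# return (pdf_info, pdf_citation) matches the reconstruction of the function above.
if __name__ == '__main__':
    info, citations = call_arXiv_API(
        'selective inference',  # free-text query; spaces are converted to '+'
        search_by='all',        # arXiv API field prefix, e.g. 'all', 'ti', 'au', 'abs', 'cat'
        sort_by='relevance',    # or 'lastUpdatedDate' / 'submittedDate'
        max_results='5',        # forwarded to the API as a string
    )
    for title, url, authors, categories, folder, citation in info:
        print(citation)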