import urllib.request
from lxml import html
import streamlit as st
import requests
import re
from stqdm import stqdm
import os
import shutil
import time
def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='arxiv-dl'):
    '''
    Queries the arXiv API and scrapes the data for each entry returned by a search.
    An entry carries the following fields (example values shown):

        id:        http://arxiv.org/abs/2008.04584v2
        updated:   2021-05-11T12:00:24Z
        published: 2020-08-11T08:47:06Z
        title:     Bayesian Selective Inference: Non-informative Priors
        summary:   We discuss Bayesian inference for parameters selected using the data.
                   First, we provide a critical analysis of the existing positions in the
                   literature regarding the correct Bayesian approach under selection.
                   Second, we propose two types of non-informative priors for selection
                   models. These priors may be employed to produce a posterior distribution
                   in the absence of prior information as well as to provide well-calibrated
                   frequentist inference for the selected parameter. We test the proposed
                   priors empirically in several scenarios.
        authors:   Daniel G. Rasines, G. Alastair Young
        comment:   24 pages, 7 figures
    '''
    # Replace whitespace in the search query so it can be embedded in the request URL
    search_query = search_query.strip().replace(" ", "+")
    # Call the arXiv API
    arXiv_url = f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
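    # e.g. http://export.arxiv.org/api/query?search_query=all:bayesian+inference&sortBy=relevance&start=0&max_results=10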
    with urllib.request.urlopen(arXiv_url) as url:
        s = url.read()
    # Parse the XML response
    root = html.fromstring(s)
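    # Note: the response is Atom XML; lxml's HTML parser ignores the Atom namespaces,
    # which is why the xpath calls below can use plain tag names such as 'entry' and 'title'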
    # Fetch the relevant PDF information from each entry
    pdf_entries = root.xpath("entry")
    pdf_titles = []
    pdf_authors = []
    pdf_urls = []
    pdf_categories = []
    folder_names = []
    pdf_citation = []
    pdf_years = []
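    # One element is appended to each list per entry, so index i lines up across all of the lists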
    for i, pdf in enumerate(pdf_entries):
        # xpath returns a list with every occurrence of the matched element. Since we are
        # handling each entry individually, we take the first element to avoid an unnecessary list
        pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))  # keep only alphanumeric characters in the title
        pdf_authors.append(pdf.xpath("author/name/text()"))
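        # Each entry carries several <link> elements; the one with title='pdf' points to the PDF file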
        pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
        pdf_categories.append(pdf.xpath("category/@term"))
        folder_names.append(folder_name)
        pdf_years.append(pdf.xpath('updated/text()')[0][:4])
        pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
    pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
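    # pdf_info holds one tuple per entry: (title, pdf_url, authors, categories, folder_name, citation)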
    # Check the number of available files
    # print('Requesting {max_results} files'.format(max_results=max_results))
if len(pdf_urls)