mehradans92 committed
Commit e5985c6 · 1 Parent(s): c4550f6

cleaned repo, added utils

Files changed (2):
  1. app.py +1 -178
  2. unitls.py +103 -0
app.py CHANGED
@@ -1,13 +1,7 @@
  import streamlit as st #Web App
- import urllib
- from lxml import html
- import requests
- import re
  import os
- from stqdm import stqdm
- import time
- import shutil
  from PIL import Image
+ from unitls import *

  import pickle
  docs = None
@@ -36,118 +30,14 @@ if len(api_key) != 51:
      st.warning('Please enter a valid OpenAI API key.', icon="⚠️")


-
- def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='arxiv-dl'):
-     '''
-     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
-     <entry>\n
-     <id>http://arxiv.org/abs/2008.04584v2</id>\n
-     <updated>2021-05-11T12:00:24Z</updated>\n
-     <published>2020-08-11T08:47:06Z</published>\n
-     <title>Bayesian Selective Inference: Non-informative Priors</title>\n
-     <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
-     <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
-     <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
-     <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
-     <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
-     <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-     <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
-     <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
-     </entry>\n
-     '''
-
-     # Remove space in seach query
-     search_query=search_query.strip().replace(" ", "+")
-     # Call arXiv API
-     arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
-     with urllib.request.urlopen(arXiv_url) as url:
-         s = url.read()
-
-     # Parse the xml data
-     root = html.fromstring(s)
-     # Fetch relevant pdf information
-     pdf_entries = root.xpath("entry")
-
-     pdf_titles = []
-     pdf_authors = []
-     pdf_urls = []
-     pdf_categories = []
-     folder_names = []
-     pdf_citation = []
-     pdf_years = []
-
-     for i, pdf in enumerate(pdf_entries):
-         # print(pdf.xpath('updated/text()')[0][:4])
-         # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
-         pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
-         pdf_authors.append(pdf.xpath("author/name/text()"))
-         pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
-         pdf_categories.append(pdf.xpath("category/@term"))
-         folder_names.append(folder_name)
-         pdf_years.append(pdf.xpath('updated/text()')[0][:4])
-         pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
-
-
-
-     pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
-
-     # Check number of available files
-     # print('Requesting {max_results} files'.format(max_results=max_results))
-     if len(pdf_urls)<int(max_results):
-         matching_pdf_num=len(pdf_urls)
-         # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
-     return pdf_info, pdf_citation
-
-
- def download_pdf(pdf_info):
-
-     # if len(os.listdir(f'./{folder_name}') ) != 0:
-     # check folder is empty to avoid using papers from old runs:
-     # os.remove(f'./{folder_name}/*')
-     all_reference_text = []
-     for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
-
-         pdf_title=p[0]
-         pdf_url=p[1]
-         pdf_author=p[2]
-         pdf_category=p[3]
-         folder_name=p[4]
-         pdf_citation=p[5]
-         r = requests.get(pdf_url, allow_redirects=True)
-         if i == 0:
-             if not os.path.exists(f'{folder_name}'):
-                 os.makedirs(f"{folder_name}")
-             else:
-                 shutil.rmtree(f'{folder_name}')
-                 os.makedirs(f"{folder_name}")
-         with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as currP:
-             currP.write(r.content)
-         if i == 0:
-             st.markdown("###### Papers found:")
-         st.markdown(f"{i+1}. {pdf_citation}")
-         time.sleep(0.15)
-         all_reference_text.append(f"{i+1}. {pdf_citation}\n")
-     if 'all_reference_text' not in st.session_state:
-         st.session_state.key = 'all_reference_text'
-     st.session_state['all_reference_text'] = ' '.join(all_reference_text)
-
-     # print(all_reference_text)
-
-
-
  max_results_current = 5
  max_results = max_results_current
- # pdf_info = ''
- # pdf_citation = ''
  def search_click_callback(search_query, max_results):
      global pdf_info, pdf_citation
      pdf_info, pdf_citation = call_arXiv_API(f'{search_query}', max_results=max_results)
      download_pdf(pdf_info)
      return pdf_info

-
-
-
  with st.form(key='columns_in_form', clear_on_submit = False):
      c1, c2 = st.columns([8,1])
      with c1:
@@ -158,7 +48,6 @@ with st.form(key='columns_in_form', clear_on_submit = False):
          max_results = st.text_input("Max papers", value=max_results_current)
          max_results_current = max_results_current
      searchButton = st.form_submit_button(label = 'Search')
-     # search_click(search_query, max_results_default)

  if searchButton:
      global pdf_info
@@ -166,39 +55,11 @@ if searchButton:
      if 'pdf_info' not in st.session_state:
          st.session_state.key = 'pdf_info'
      st.session_state['pdf_info'] = pdf_info
-     # print(f'This is PDF info from search:{pdf_info}')
-
-
- # def tokenize_callback():
-
- # return docs
-
- # tokenization_form = st.form(key='tokenization-form')
- # tokenization_form.markdown(f"Happy with your paper search results? ")
- # toknizeButton = tokenization_form.form_submit_button(label = "Yes! Let's tokenize.", on_click=tokenize_callback())
- # tokenization_form.markdown("If not, change keywords and search again. [This step costs!](https://openai.com/api/pricing/)")
-
-
-
- # submitButton = form.form_submit_button('Submit')
- # with st.form(key='tokenization_form', clear_on_submit = False):
- # st.markdown(f"Happy with your paper search results? If not, change keywords and search again. [This step costs!](https://openai.com/api/pricing/)")
- # # st.text_input("Input search query here:", placeholder='Keywords for most relevant search...'
- # # )#search_query, max_results_current))
- # toknizeButton = st.form_submit_button(label = "Yes! Let's tokenize.")
-
- # if toknizeButton:
- # tokenize_callback()
-
- # tokenize_callback()
-
-


  def answer_callback(question_query):
      import paperqa
      global docs
-     # global pdf_info
      progress_text = "Please wait..."
      # my_bar = st.progress(0, text = progress_text)
      st.info('Please wait...', icon="🔥")
@@ -221,8 +82,6 @@ def answer_callback(question_query):
      st.success('Voila!')
      return answer.formatted_answer

-
-
  form = st.form(key='question_form')
  question_query = form.text_input("What do you wanna know from these papers?", placeholder='Input questions here...',
      value='')
@@ -232,39 +91,3 @@ if submitButton:
      with st.expander("Found papers:", expanded=True):
          st.write(f"{st.session_state['all_reference_text']}")
      st.text_area("Answer:", answer_callback(question_query), height=600)
-
- # with st.form(key='question_form', clear_on_submit = False):
- # question_query = st.text_input("What do you wanna know from these papers?", placeholder='Input questions here')
- # # st.text_input("Input search query here:", placeholder='Keywords for most relevant search...'
- # # )#search_query, max_results_current))
- # submitButton = form.form_submit_button(label = "Submit", on_click=answer_callback(question_query))
-
-
- # Simulation-based inference bayesian model selection
-
-
-
-
-
- # test = "<ul> \
- # <li>List item here</li> \
- # <li>List item here</li> \
- # <li>List item here</li> \
- # <li>List item here</li> \
- # </ul>"
- # test = "'''It was the best of times, it was the worst of times, it was \
- # the age of wisdom, it was the age of foolishness, it was \
- # the epoch of belief, it was the epoch of incredulity, it \
- # was the season of Light, it was the season of Darkness, it\
- # was the spring of hope, it was the winter of despair, (...)'''"
-
- # citation_text = st.text_area('Papers found:',test, height=300) # f'{pdf_citation}'
-
-
- # for i, cite in enumerate(pdf_citation):
- # st.markdown(f'{i+1}. {cite}')
- # time.sleep(1)
-
-
- # def make_clickable('link',text):
- # return f'<a target="_blank" href="{link}">{text}'
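
With the helper functions stripped out, app.py keeps only the Streamlit flow and pulls call_arXiv_API and download_pdf back in through the "from unitls import *" line added above. A minimal sketch of the resulting call chain, for orientation only (it is not part of the commit, and the query string and paper count below are invented):

    from unitls import call_arXiv_API, download_pdf   # app.py itself uses the wildcard import

    search_query = "bayesian selective inference"      # hypothetical search terms
    # Query arXiv's export API for matching entries, then download each PDF
    # into ./arxiv-dl (the default folder_name) and build the citation list.
    pdf_info, pdf_citation = call_arXiv_API(search_query, max_results=5)
    download_pdf(pdf_info)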
 
unitls.py ADDED
@@ -0,0 +1,103 @@
+ import urllib
+ from lxml import html
+ import streamlit as st
+ import requests
+ import re
+ from stqdm import stqdm
+ import os
+ import shutil
+ import time
+
+ def call_arXiv_API(search_query, search_by='all', sort_by='relevance', max_results='10', folder_name='arxiv-dl'):
+     '''
+     Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
+     <entry>\n
+     <id>http://arxiv.org/abs/2008.04584v2</id>\n
+     <updated>2021-05-11T12:00:24Z</updated>\n
+     <published>2020-08-11T08:47:06Z</published>\n
+     <title>Bayesian Selective Inference: Non-informative Priors</title>\n
+     <summary> We discuss Bayesian inference for parameters selected using the data. First,\nwe provide a critical analysis of the existing positions in the literature\nregarding the correct Bayesian approach under selection. Second, we propose two\ntypes of non-informative priors for selection models. These priors may be\nemployed to produce a posterior distribution in the absence of prior\ninformation as well as to provide well-calibrated frequentist inference for the\nselected parameter. We test the proposed priors empirically in several\nscenarios.\n</summary>\n
+     <author>\n <name>Daniel G. Rasines</name>\n </author>\n <author>\n <name>G. Alastair Young</name>\n </author>\n
+     <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">24 pages, 7 figures</arxiv:comment>\n
+     <link href="http://arxiv.org/abs/2008.04584v2" rel="alternate" type="text/html"/>\n
+     <link title="pdf" href="http://arxiv.org/pdf/2008.04584v2" rel="related" type="application/pdf"/>\n
+     <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+     <category term="math.ST" scheme="http://arxiv.org/schemas/atom"/>\n
+     <category term="stat.TH" scheme="http://arxiv.org/schemas/atom"/>\n
+     </entry>\n
+     '''
+
+     # Remove space in seach query
+     search_query=search_query.strip().replace(" ", "+")
+     # Call arXiv API
+     arXiv_url=f'http://export.arxiv.org/api/query?search_query={search_by}:{search_query}&sortBy={sort_by}&start=0&max_results={max_results}'
+     with urllib.request.urlopen(arXiv_url) as url:
+         s = url.read()
+
+     # Parse the xml data
+     root = html.fromstring(s)
+     # Fetch relevant pdf information
+     pdf_entries = root.xpath("entry")
+
+     pdf_titles = []
+     pdf_authors = []
+     pdf_urls = []
+     pdf_categories = []
+     folder_names = []
+     pdf_citation = []
+     pdf_years = []
+
+     for i, pdf in enumerate(pdf_entries):
+         # print(pdf.xpath('updated/text()')[0][:4])
+         # xpath return a list with every ocurrence of the html path. Since we're getting each entry individually, we'll take the first element to avoid an unecessary list
+         pdf_titles.append(re.sub('[^a-zA-Z0-9]', ' ', pdf.xpath("title/text()")[0]))
+         pdf_authors.append(pdf.xpath("author/name/text()"))
+         pdf_urls.append(pdf.xpath("link[@title='pdf']/@href")[0])
+         pdf_categories.append(pdf.xpath("category/@term"))
+         folder_names.append(folder_name)
+         pdf_years.append(pdf.xpath('updated/text()')[0][:4])
+         pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. arXiv [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+
+
+
+     pdf_info=list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+
+     # Check number of available files
+     # print('Requesting {max_results} files'.format(max_results=max_results))
+     if len(pdf_urls)<int(max_results):
+         matching_pdf_num=len(pdf_urls)
+         # print('Only {matching_pdf_num} files available'.format(matching_pdf_num=matching_pdf_num))
+     return pdf_info, pdf_citation
+
+
+ def download_pdf(pdf_info):
+
+     # if len(os.listdir(f'./{folder_name}') ) != 0:
+     # check folder is empty to avoid using papers from old runs:
+     # os.remove(f'./{folder_name}/*')
+     all_reference_text = []
+     for i,p in enumerate(stqdm(pdf_info, desc='Searching and downloading papers')):
+
+         pdf_title=p[0]
+         pdf_url=p[1]
+         pdf_author=p[2]
+         pdf_category=p[3]
+         folder_name=p[4]
+         pdf_citation=p[5]
+         r = requests.get(pdf_url, allow_redirects=True)
+         if i == 0:
+             if not os.path.exists(f'{folder_name}'):
+                 os.makedirs(f"{folder_name}")
+             else:
+                 shutil.rmtree(f'{folder_name}')
+                 os.makedirs(f"{folder_name}")
+         with open(f'{folder_name}/{pdf_title}.pdf', 'wb') as currP:
+             currP.write(r.content)
+         if i == 0:
+             st.markdown("###### Papers found:")
+         st.markdown(f"{i+1}. {pdf_citation}")
+         time.sleep(0.15)
+         all_reference_text.append(f"{i+1}. {pdf_citation}\n")
+     if 'all_reference_text' not in st.session_state:
+         st.session_state.key = 'all_reference_text'
+     st.session_state['all_reference_text'] = ' '.join(all_reference_text)
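
For reference, each element of pdf_info returned by call_arXiv_API above (and unpacked positionally as p[0]..p[5] in download_pdf) is a 6-tuple of (title, pdf_url, authors, categories, folder_name, citation). A sketch of one such entry, filled in from the example record quoted in the docstring; the values are illustrative, not the output of an actual run:

    example_entry = (
        "Bayesian Selective Inference  Non informative Priors",   # title, non-alphanumerics replaced by spaces via re.sub
        "http://arxiv.org/pdf/2008.04584v2",                      # pdf_url from the Atom <link title="pdf"> element
        ["Daniel G. Rasines", "G. Alastair Young"],               # pdf_authors
        ["math.ST", "stat.TH"],                                   # pdf_categories
        "arxiv-dl",                                               # folder_name (the default)
        "Daniel G. Rasines, G. Alastair Young, Bayesian Selective Inference  Non informative Priors. arXiv [math.ST] (2021), (available at http://arxiv.org/pdf/2008.04584v2).",
    )
    pdf_title, pdf_url, pdf_authors, pdf_categories, folder_name, pdf_citation = example_entry
    # download_pdf would then save this paper as f"{folder_name}/{pdf_title}.pdf"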