pathfinder_v3

Running

App Files Community

kiyer committed on Jan 12, 2024

Commit

f28b621

1 Parent(s): 182832e

updates to codebase for embeddings and RAG QA.

Browse files

Files changed (6) hide show

.DS_Store +0 -0
absts/.DS_Store +0 -0
pages/{2_arxiv_embedding.py → 1_arxiv_embedding_explorer.py} +16 -7
pages/{1_paper_search.py → 2_paper_search.py} +0 -0
pages/{3_qa_sources_v2.py → 3_answering_questions.py} +9 -95
pages/{3_qa_sources_v1.py → 4_qa_sources_v1.py} +9 -27

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

absts/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

pages/{2_arxiv_embedding.py → 1_arxiv_embedding_explorer.py} RENAMED Viewed

@@ -74,9 +74,9 @@ def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15):
 st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
 st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')
-st.sidebar.text_input("Search query", key="phrase", value="")
-alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.1)
-size_value = st.sidebar.slider("Pick the hexbin size",0.0,2.0,0.2)
 phrase=st.session_state.phrase
@@ -103,10 +103,19 @@ ID: $index
 """
 p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
-           title="UMAP projection of trained ArXiv corpus | heatmap keyword: "+phrase)
-p.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1], size=size_value,
-         palette = np.flip(OrRd[8]), alpha=alpha_value)
 p.circle('x', 'y', size=3, source=source, alpha=0.3)
 st.bokeh_chart(p)

 st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
 st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')
+st.sidebar.text_input("Search query", key="phrase", value="Quenching")
+alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.81)
+size_value = st.sidebar.slider("Pick the hexbin gridsize",10,50,20)
 phrase=st.session_state.phrase
 """
 p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
+           title="UMAP projection of embeddings for the astro-ph.GA corpus"+phrase)
+# p.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1], size=size_value,
+#          palette = np.flip(OrRd[8]), alpha=alpha_value)
 p.circle('x', 'y', size=3, source=source, alpha=0.3)
 st.bokeh_chart(p)
+fig = plt.figure(figsize=(10.5,9*0.8328))
+plt.scatter(embedding[0:,0], embedding[0:,1],s=2,alpha=0.1)
+plt.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1],
+gridsize=size_value, cmap = 'viridis', alpha=alpha_value,extent=(-1,16,1.5,16),mincnt=10)
+plt.title("UMAP localization of heatmap keyword: "+phrase)
+plt.axis([0,15,2.5,15]);
+clbr = plt.colorbar(); clbr.set_label('# papers')
+plt.axis('off')
+st.pyplot(fig)

pages/{1_paper_search.py → 2_paper_search.py} RENAMED Viewed

File without changes

pages/{3_qa_sources_v2.py → 3_answering_questions.py} RENAMED Viewed

@@ -1,4 +1,3 @@
-# set the environment variables needed for openai package to know to reach out to azure
 import os
 import datetime
 import faiss
@@ -181,7 +180,7 @@ def list_similar_papers_v2(model_data,
     for i in range(start_range,start_range+return_n):
         abstracts_relevant.append(all_text[sims[i]])
-        fhdr = all_authors[sims[i]][0]['name'].split()[-1] + all_arxivid[sims[i]][0:2] +'_'+ all_arxivid[sims[i]]
         fhdrs.append(fhdr)
         textstr = textstr + str(i+1)+'. **'+ all_titles[sims[i]] +'** (Distance: %.2f' %dists[i]+')   \n'
         textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
@@ -325,7 +324,7 @@ def run_rag(query, return_n = 10, show_authors = True, show_summary = True):
             temp = temp[0:-2] + ' et al. 19' + temp[-2:]
         temp = '['+temp+']('+all_links[int(srcnames[i].split('_')[0].split('/')[1])]+')'
         st.markdown(temp)
-    simids = np.array(srcindices)
     fig = plt.figure(figsize=(9,9))
     plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
@@ -338,100 +337,15 @@ def run_rag(query, return_n = 10, show_authors = True, show_summary = True):
     return rag_answer
-def run_query(query, return_n = 3, show_pure_answer = False, show_all_sources = True):
-    show_authors = True
-    show_summary = True
-    sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
-                                  doc_id = query,
-                                  input_type='keywords',
-                                  show_authors = show_authors, show_summary = show_summary,
-                                  return_n = return_n)
-    temp_abst = ''
-    loaders = []
-    for i in range(len(absts)):
-        temp_abst = absts[i]
-        try:
-            text_file = open("absts/"+fhdrs[i]+".txt", "w")
-        except:
-            os.mkdir('absts')
-            text_file = open("absts/"+fhdrs[i]+".txt", "w")
-        n = text_file.write(temp_abst)
-        text_file.close()
-        loader = TextLoader("absts/"+fhdrs[i]+".txt")
-        loaders.append(loader)
-    lc_index = VectorstoreIndexCreator().from_loaders(loaders)
-    st.markdown('### User query: '+query)
-    if show_pure_answer == True:
-        st.markdown('pure answer:')
-        st.markdown(lc_index.query(query))
-        st.markdown(' ')
-    st.markdown('#### context-based answer from sources:')
-    output = lc_index.query_with_sources(query + ' Let\'s work this out in a step by step way to be sure we have the right answer.' ) #zero-shot in-context prompting from Zhou+22, Kojima+22
-    st.markdown(output['answer'])
-    opstr = '#### Primary sources: \n'
-    st.markdown(opstr)
-#     opstr = ''
-#     for i in range(len(output['sources'])):
-#         opstr = opstr +'\n'+ output['sources'][i]
-    textstr = ''
-    ng = len(output['sources'].split())
-    abs_indices = []
-    for i in range(ng):
-        if i == (ng-1):
-            tempid = output['sources'].split()[i].split('_')[1][0:-4]
-        else:
-            tempid = output['sources'].split()[i].split('_')[1][0:-5]
-        try:
-            abs_index = all_arxivid.index(tempid)
-            abs_indices.append(abs_index)
-            textstr = textstr + str(i+1)+'. **'+ all_titles[abs_index] +'   \n'
-            textstr = textstr + '**ArXiv:** ['+all_arxivid[abs_index]+'](https://arxiv.org/abs/'+all_arxivid[abs_index]+')  \n'
-            textstr = textstr + '**Authors:**  '
-            temp = all_authors[abs_index]
-            for ak in range(4):
-                if ak < len(temp)-1:
-                    textstr = textstr + temp[ak].name + ', '
-                else:
-                    textstr = textstr + temp[ak].name + '   \n'
-            if len(temp) > 3:
-                textstr = textstr + ' et al.    \n'
-            textstr = textstr + '**Summary:**  '
-            text = all_text[abs_index]
-            text = text.replace('\n', ' ')
-            textstr = textstr + summarizer.summarize(text) + '  \n'
-        except:
-            textstr = textstr + output['sources'].split()[i]
-        #         opstr = opstr + '  \n ' + output['sources'].split()[i][6:-5].split('_')[0]
-        #     opstr = opstr + '  \n Arxiv id: ' + output['sources'].split()[i][6:-5].split('_')[1]
-        textstr = textstr + ' '
-        textstr = textstr + '  \n'
-    st.markdown(textstr)
-    fig = plt.figure(figsize=(9,9))
-    plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
-    plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
-    plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
-    st.pyplot(fig)
-    if show_all_sources == True:
-        st.markdown('\n #### Other interesting papers:')
-        st.markdown(sims)
-    return output
 st.title('ArXiv-based question answering')
 st.markdown('[Includes papers up to: `'+dateval+'`]')
-st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
-query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
-return_n = st.slider('How many papers should I show?', 1, 20, 10)
-sims = run_query(query, return_n = return_n)

 import os
 import datetime
 import faiss
     for i in range(start_range,start_range+return_n):
         abstracts_relevant.append(all_text[sims[i]])
+        fhdr = str(sims[i])+'_'+all_authors[sims[i]][0]['name'].split()[-1] + all_arxivid[sims[i]][0:2] +'_'+ all_arxivid[sims[i]]
         fhdrs.append(fhdr)
         textstr = textstr + str(i+1)+'. **'+ all_titles[sims[i]] +'** (Distance: %.2f' %dists[i]+')   \n'
         textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
             temp = temp[0:-2] + ' et al. 19' + temp[-2:]
         temp = '['+temp+']('+all_links[int(srcnames[i].split('_')[0].split('/')[1])]+')'
         st.markdown(temp)
+    abs_indices = np.array(srcindices)
     fig = plt.figure(figsize=(9,9))
     plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
     return rag_answer
 st.title('ArXiv-based question answering')
 st.markdown('[Includes papers up to: `'+dateval+'`]')
+st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
+st.markdown('The answers are followed by relevant source(s) used in the answer, a graph showing which part of the astro-ph.GA manifold it drew the answer from (tightly clustered points generally indicate high quality/consensus answers) followed by a bunch of relevant papers used by the RAG to compose the answer.')
+st.markdown('If this does not satisfactorily answer your question or rambles too much, you can also try the older `qa_sources_v1` page.')
+query = st.text_input('Your question here:',
+value="What causes galaxy quenching at high redshifts?")
+return_n = st.slider('How many papers should I show?', 1, 30, 10)
+sims = run_rag(query, return_n = return_n)

pages/{3_qa_sources_v1.py → 4_qa_sources_v1.py} RENAMED Viewed

@@ -118,7 +118,7 @@ def find_papers_by_author(auth_name):
     return doc_ids
-def faiss_based_indices(input_vector, nindex=10):
     xq = input_vector.reshape(-1,1).T.astype('float32')
     D, I = index.search(xq, nindex)
     return I[0], D[0]
@@ -126,7 +126,7 @@ def faiss_based_indices(input_vector, nindex=10):
 def list_similar_papers_v2(model_data,
                         doc_id = [], input_type = 'doc_id',
                         show_authors = False, show_summary = False,
-                        return_n = 10):
     arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data
@@ -152,7 +152,7 @@ def list_similar_papers_v2(model_data,
         print('unrecognized input type.')
         return
-    sims, dists = faiss_based_indices(inferred_vector, return_n+2)
     textstr = ''
     abstracts_relevant = []
     fhdrs = []
@@ -182,30 +182,9 @@ def list_similar_papers_v2(model_data,
         textstr = textstr + '  \n'
     return textstr, abstracts_relevant, fhdrs, sims
-def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {openai.api_key}",
-    }
-    data = {
-        "model": model,
-        "messages": messages,
-        "temperature": temperature,
-    }
-    if max_tokens is not None:
-        data["max_tokens"] = max_tokens
-    response = requests.post(API_ENDPOINT, headers=headers, data=json.dumps(data))
-    if response.status_code == 200:
-        return response.json()["choices"][0]["message"]["content"]
-    else:
-        raise Exception(f"Error {response.status_code}: {response.text}")
 model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]
-def run_query(query, return_n = 3, show_pure_answer = False, show_all_sources = True):
     show_authors = True
     show_summary = True
@@ -213,7 +192,7 @@ def run_query(query, return_n = 3, show_pure_answer = False, show_all_sources =
                                   doc_id = query,
                                   input_type='keywords',
                                   show_authors = show_authors, show_summary = show_summary,
-                                  return_n = return_n)
     temp_abst = ''
     loaders = []
@@ -300,5 +279,8 @@ st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please
 query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
 return_n = st.slider('How many papers should I show?', 1, 20, 10)
-sims = run_query(query, return_n = return_n)

     return doc_ids
+def faiss_based_indices(input_vector, nindex=10, yrmin = 1990, yrmax = 2024):
     xq = input_vector.reshape(-1,1).T.astype('float32')
     D, I = index.search(xq, nindex)
     return I[0], D[0]
 def list_similar_papers_v2(model_data,
                         doc_id = [], input_type = 'doc_id',
                         show_authors = False, show_summary = False,
+                        return_n = 10, yrmin = 1990, yrmax = 2024):
     arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data
         print('unrecognized input type.')
         return
+    sims, dists = faiss_based_indices(inferred_vector, return_n+2, yrmin = 1990, yrmax = 2024)
     textstr = ''
     abstracts_relevant = []
     fhdrs = []
         textstr = textstr + '  \n'
     return textstr, abstracts_relevant, fhdrs, sims
 model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]
+def run_query(query, return_n = 3, yrmin = 1990, yrmax = 2024, show_pure_answer = False, show_all_sources = True):
     show_authors = True
     show_summary = True
                                   doc_id = query,
                                   input_type='keywords',
                                   show_authors = show_authors, show_summary = show_summary,
+                                  return_n = return_n, yrmin = 1990, yrmax = 2024)
     temp_abst = ''
     loaders = []
 query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
 return_n = st.slider('How many papers should I show?', 1, 20, 10)
+yrmin = st.slider('Min year', 1990,2023, 1990)
+yrmax = st.slider('Max year', 1990, 2024, 2024)
+sims = run_query(query, return_n = return_n, yrmin = yrmin, yrmax = yrmax)