pathfinder_v3

Running

App Files Files Community

kiyer commited on Jan 10, 2024

Commit

0d72411

1 Parent(s): 9a6132f

update qa sources

Browse files

Files changed (4) hide show

.DS_Store +0 -0
local_files/.DS_Store +0 -0
pages/{3_qa_sources.py → 3_qa_sources_v1.py} +0 -0
pages/3_qa_sources_v2.py +437 -0

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

local_files/.DS_Store CHANGED Viewed

Binary files a/local_files/.DS_Store and b/local_files/.DS_Store differ

pages/{3_qa_sources.py → 3_qa_sources_v1.py} RENAMED Viewed

File without changes

pages/3_qa_sources_v2.py ADDED Viewed

	@@ -0,0 +1,437 @@

+# set the environment variables needed for openai package to know to reach out to azure
+import os
+import datetime
+import faiss
+import streamlit as st
+import feedparser
+import urllib
+import cloudpickle as cp
+import pickle
+from urllib.request import urlopen
+from summa import summarizer
+import numpy as np
+import matplotlib.pyplot as plt
+import requests
+import json
+from langchain.document_loaders import TextLoader
+from langchain.indexes import VectorstoreIndexCreator
+from langchain_openai import AzureOpenAIEmbeddings
+from langchain.llms import OpenAI
+from langchain_openai import AzureChatOpenAI
+from langchain import hub
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+os.environ["OPENAI_API_TYPE"] = "azure"
+os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
+os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
+os.environ["OPENAI_API_VERSION"] = "2023-05-15"
+embeddings = AzureOpenAIEmbeddings(
+    deployment="embedding",
+    model="text-embedding-ada-002",
+    azure_endpoint=st.secrets["endpoint1"],
+)
+llm = AzureChatOpenAI(
+        deployment_name="gpt4_small",
+        openai_api_version="2023-12-01-preview",
+        azure_endpoint=st.secrets["endpoint2"],
+        openai_api_key=st.secrets["key2"],
+        openai_api_type="azure",
+        temperature=0.
+    )
+@st.cache_data
+def get_feeds_data(url):
+    # data = cp.load(urlopen(url))
+    with open(url, "rb") as fp:
+        data = pickle.load(fp)
+    st.sidebar.success("Loaded data")
+    return data
+# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
+# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
+dateval = "27-Jun-2023"
+feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
+embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
+gal_feeds = get_feeds_data(feeds_link)
+arxiv_ada_embeddings = get_feeds_data(embed_link)
+@st.cache_data
+def get_embedding_data(url):
+    # data = cp.load(urlopen(url))
+    with open(url, "rb") as fp:
+        data = pickle.load(fp)
+    st.sidebar.success("Fetched data from API!")
+    return data
+# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
+url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
+e2d = get_embedding_data(url)
+# e2d, _, _, _, _ = get_embedding_data(url)
+ctr = -1
+num_chunks = len(gal_feeds)
+all_text, all_titles, all_arxivid, all_links, all_authors = [], [], [], [], []
+for nc in range(num_chunks):
+    for i in range(len(gal_feeds[nc].entries)):
+        text = gal_feeds[nc].entries[i].summary
+        text = text.replace('\n', ' ')
+        text = text.replace('\\', '')
+        all_text.append(text)
+        all_titles.append(gal_feeds[nc].entries[i].title)
+        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
+        all_links.append(gal_feeds[nc].entries[i].links[1].href)
+        all_authors.append(gal_feeds[nc].entries[i].authors)
+d = arxiv_ada_embeddings.shape[1]                           # dimension
+nb = arxiv_ada_embeddings.shape[0]                      # database size
+xb = arxiv_ada_embeddings.astype('float32')
+index = faiss.IndexFlatL2(d)
+index.add(xb)
+def run_simple_query(search_query = 'all:sed+fitting', max_results = 10, start = 0, sort_by = 'lastUpdatedDate', sort_order = 'descending'):
+    """
+        Query ArXiv to return search results for a particular query
+        Parameters
+        ----------
+        query: str
+            query term. use prefixes ti, au, abs, co, jr, cat, m, id, all as applicable.
+        max_results: int, default = 10
+            number of results to return. numbers > 1000 generally lead to timeouts
+        start: int, default = 0
+            start index for results reported. use this if you're interested in running chunks.
+        Returns
+        -------
+        feed: dict
+            object containing requested results parsed with feedparser
+        Notes
+        -----
+            add functionality for chunk parsing, as well as storage and retreival
+        """
+    base_url = 'http://export.arxiv.org/api/query?';
+    query = 'search_query=%s&start=%i&max_results=%i&sortBy=%s&sortOrder=%s' % (search_query,
+                                                     start,
+                                                     max_results,sort_by,sort_order)
+    response = urllib.request.urlopen(base_url+query).read()
+    feed = feedparser.parse(response)
+    return feed
+def find_papers_by_author(auth_name):
+    doc_ids = []
+    for doc_id in range(len(all_authors)):
+        for auth_id in range(len(all_authors[doc_id])):
+            if auth_name.lower() in all_authors[doc_id][auth_id]['name'].lower():
+                print('Doc ID: ',doc_id, ' | arXiv: ', all_arxivid[doc_id], '| ', all_titles[doc_id],' | Author entry: ', all_authors[doc_id][auth_id]['name'])
+                doc_ids.append(doc_id)
+    return doc_ids
+def faiss_based_indices(input_vector, nindex=10):
+    xq = input_vector.reshape(-1,1).T.astype('float32')
+    D, I = index.search(xq, nindex)
+    return I[0], D[0]
+def list_similar_papers_v2(model_data,
+                        doc_id = [], input_type = 'doc_id',
+                        show_authors = False, show_summary = False,
+                        return_n = 10):
+    arxiv_ada_embeddings, embeddings, all_titles, all_abstracts, all_authors = model_data
+    if input_type == 'doc_id':
+        print('Doc ID: ',doc_id,', title: ',all_titles[doc_id])
+#         inferred_vector = model.infer_vector(train_corpus[doc_id].words)
+        inferred_vector = arxiv_ada_embeddings[doc_id,0:]
+        start_range = 1
+    elif input_type == 'arxiv_id':
+        print('ArXiv id: ',doc_id)
+        arxiv_query_feed = run_simple_query(search_query='id:'+str(doc_id))
+        if len(arxiv_query_feed.entries) == 0:
+            print('error: arxiv id not found.')
+            return
+        else:
+            print('Title: '+arxiv_query_feed.entries[0].title)
+            inferred_vector = np.array(embeddings.embed_query(arxiv_query_feed.entries[0].summary))
+        start_range = 0
+    elif input_type == 'keywords':
+        inferred_vector = np.array(embeddings.embed_query(doc_id))
+        start_range = 0
+    else:
+        print('unrecognized input type.')
+        return
+    sims, dists = faiss_based_indices(inferred_vector, return_n+2)
+    textstr = ''
+    abstracts_relevant = []
+    fhdrs = []
+    for i in range(start_range,start_range+return_n):
+        abstracts_relevant.append(all_text[sims[i]])
+        fhdr = all_authors[sims[i]][0]['name'].split()[-1] + all_arxivid[sims[i]][0:2] +'_'+ all_arxivid[sims[i]]
+        fhdrs.append(fhdr)
+        textstr = textstr + str(i+1)+'. **'+ all_titles[sims[i]] +'** (Distance: %.2f' %dists[i]+')   \n'
+        textstr = textstr + '**ArXiv:** ['+all_arxivid[sims[i]]+'](https://arxiv.org/abs/'+all_arxivid[sims[i]]+')  \n'
+        if show_authors == True:
+            textstr = textstr + '**Authors:**  '
+            temp = all_authors[sims[i]]
+            for ak in range(len(temp)):
+                if ak < len(temp)-1:
+                    textstr = textstr + temp[ak].name + ', '
+                else:
+                    textstr = textstr + temp[ak].name + '   \n'
+        if show_summary == True:
+            textstr = textstr + '**Summary:**  '
+            text = all_text[sims[i]]
+            text = text.replace('\n', ' ')
+            textstr = textstr + summarizer.summarize(text) + '  \n'
+        if show_authors == True or show_summary == True:
+            textstr = textstr + ' '
+        textstr = textstr + '  \n'
+    return textstr, abstracts_relevant, fhdrs, sims
+def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {openai.api_key}",
+    }
+    data = {
+        "model": model,
+        "messages": messages,
+        "temperature": temperature,
+    }
+    if max_tokens is not None:
+        data["max_tokens"] = max_tokens
+    response = requests.post(API_ENDPOINT, headers=headers, data=json.dumps(data))
+    if response.status_code == 200:
+        return response.json()["choices"][0]["message"]["content"]
+    else:
+        raise Exception(f"Error {response.status_code}: {response.text}")
+model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+def get_textstr(i, show_authors=False, show_summary=False):
+    textstr = ''
+    textstr = '**'+ all_titles[i] +'**   \n'
+    textstr = textstr + '**ArXiv:** ['+all_arxivid[i]+'](https://arxiv.org/abs/'+all_arxivid[i]+')  \n'
+    if show_authors == True:
+        textstr = textstr + '**Authors:**  '
+        temp = all_authors[i]
+        for ak in range(len(temp)):
+            if ak < len(temp)-1:
+                textstr = textstr + temp[ak].name + ', '
+            else:
+                textstr = textstr + temp[ak].name + '   \n'
+    if show_summary == True:
+        textstr = textstr + '**Summary:**  '
+        text = all_text[i]
+        text = text.replace('\n', ' ')
+        textstr = textstr + summarizer.summarize(text) + '  \n'
+    if show_authors == True or show_summary == True:
+        textstr = textstr + ' '
+    textstr = textstr + '  \n'
+    return textstr
+def run_rag(query, return_n = 10, show_authors = True, show_summary = True):
+    sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
+                                  doc_id = query,
+                                  input_type='keywords',
+                                  show_authors = show_authors, show_summary = show_summary,
+                                  return_n = return_n)
+    temp_abst = ''
+    loaders = []
+    for i in range(len(absts)):
+        temp_abst = absts[i]
+        try:
+            text_file = open("absts/"+fhdrs[i]+".txt", "w")
+        except:
+            os.mkdir('absts')
+            text_file = open("absts/"+fhdrs[i]+".txt", "w")
+        n = text_file.write(temp_abst)
+        text_file.close()
+        loader = TextLoader("absts/"+fhdrs[i]+".txt")
+        loaders.append(loader)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
+    splits = text_splitter.split_documents([loader.load()[0] for loader in loaders])
+    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
+    retriever = vectorstore.as_retriever()
+    template = """You are an assistant with expertise in astrophysics for question-answering tasks.
+    Use the following pieces of retrieved context from the literature to answer the question.
+    If you don't know the answer, just say that you don't know.
+    Use six sentences maximum and keep the answer concise.
+    {context}
+    Question: {question}
+    Answer:"""
+    custom_rag_prompt = PromptTemplate.from_template(template)
+    rag_chain_from_docs = (
+        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
+        | custom_rag_prompt
+        | llm
+        | StrOutputParser()
+    )
+    rag_chain_with_source = RunnableParallel(
+        {"context": retriever, "question": RunnablePassthrough()}
+    ).assign(answer=rag_chain_from_docs)
+    rag_answer = rag_chain_with_source.invoke(query)
+    st.markdown('### User query: '+query)
+    st.markdown(rag_answer['answer'])
+    opstr = '#### Primary sources: \n'
+    srcnames = []
+    for i in range(len(rag_answer['context'])):
+        srcnames.append(rag_answer['context'][0].metadata['source'])
+    srcnames = np.unique(srcnames)
+    srcindices = []
+    for i in range(len(srcnames)):
+        temp = srcnames[i].split('_')[1]
+        srcindices.append(int(srcnames[i].split('_')[0].split('/')[1]))
+        if int(temp[-2:]) < 40:
+            temp = temp[0:-2] + ' et al. 20' + temp[-2:]
+        else:
+            temp = temp[0:-2] + ' et al. 19' + temp[-2:]
+        temp = '['+temp+']('+all_links[int(srcnames[i].split('_')[0].split('/')[1])]+')'
+        st.markdown(temp)
+    simids = np.array(srcindices)
+    fig = plt.figure(figsize=(9,9))
+    plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
+    plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
+    plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
+    st.pyplot(fig)
+    st.markdown('\n #### List of relevant papers:')
+    st.markdown(sims)
+    return rag_answer
+def run_query(query, return_n = 3, show_pure_answer = False, show_all_sources = True):
+    show_authors = True
+    show_summary = True
+    sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
+                                  doc_id = query,
+                                  input_type='keywords',
+                                  show_authors = show_authors, show_summary = show_summary,
+                                  return_n = return_n)
+    temp_abst = ''
+    loaders = []
+    for i in range(len(absts)):
+        temp_abst = absts[i]
+        try:
+            text_file = open("absts/"+fhdrs[i]+".txt", "w")
+        except:
+            os.mkdir('absts')
+            text_file = open("absts/"+fhdrs[i]+".txt", "w")
+        n = text_file.write(temp_abst)
+        text_file.close()
+        loader = TextLoader("absts/"+fhdrs[i]+".txt")
+        loaders.append(loader)
+    lc_index = VectorstoreIndexCreator().from_loaders(loaders)
+    st.markdown('### User query: '+query)
+    if show_pure_answer == True:
+        st.markdown('pure answer:')
+        st.markdown(lc_index.query(query))
+        st.markdown(' ')
+    st.markdown('#### context-based answer from sources:')
+    output = lc_index.query_with_sources(query + ' Let\'s work this out in a step by step way to be sure we have the right answer.' ) #zero-shot in-context prompting from Zhou+22, Kojima+22
+    st.markdown(output['answer'])
+    opstr = '#### Primary sources: \n'
+    st.markdown(opstr)
+#     opstr = ''
+#     for i in range(len(output['sources'])):
+#         opstr = opstr +'\n'+ output['sources'][i]
+    textstr = ''
+    ng = len(output['sources'].split())
+    abs_indices = []
+    for i in range(ng):
+        if i == (ng-1):
+            tempid = output['sources'].split()[i].split('_')[1][0:-4]
+        else:
+            tempid = output['sources'].split()[i].split('_')[1][0:-5]
+        try:
+            abs_index = all_arxivid.index(tempid)
+            abs_indices.append(abs_index)
+            textstr = textstr + str(i+1)+'. **'+ all_titles[abs_index] +'   \n'
+            textstr = textstr + '**ArXiv:** ['+all_arxivid[abs_index]+'](https://arxiv.org/abs/'+all_arxivid[abs_index]+')  \n'
+            textstr = textstr + '**Authors:**  '
+            temp = all_authors[abs_index]
+            for ak in range(4):
+                if ak < len(temp)-1:
+                    textstr = textstr + temp[ak].name + ', '
+                else:
+                    textstr = textstr + temp[ak].name + '   \n'
+            if len(temp) > 3:
+                textstr = textstr + ' et al.    \n'
+            textstr = textstr + '**Summary:**  '
+            text = all_text[abs_index]
+            text = text.replace('\n', ' ')
+            textstr = textstr + summarizer.summarize(text) + '  \n'
+        except:
+            textstr = textstr + output['sources'].split()[i]
+        #         opstr = opstr + '  \n ' + output['sources'].split()[i][6:-5].split('_')[0]
+        #     opstr = opstr + '  \n Arxiv id: ' + output['sources'].split()[i][6:-5].split('_')[1]
+        textstr = textstr + ' '
+        textstr = textstr + '  \n'
+    st.markdown(textstr)
+    fig = plt.figure(figsize=(9,9))
+    plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
+    plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
+    plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
+    st.pyplot(fig)
+    if show_all_sources == True:
+        st.markdown('\n #### Other interesting papers:')
+        st.markdown(sims)
+    return output
+st.title('ArXiv-based question answering')
+st.markdown('[Includes papers up to: `'+dateval+'`]')
+st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
+query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
+return_n = st.slider('How many papers should I show?', 1, 20, 10)
+sims = run_query(query, return_n = return_n)