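# Streamlit app: visualize "research hotspots" on a 2D embedding of arXiv
# astro-ph.GA abstracts. Precomputed feeds, ada-002 embeddings, and a 2D
# projection are loaded from local pickles; the user selects a time window
# and the app maps where papers from that window cluster relative to a
# reference distribution.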
import os
import datetime
import faiss
import streamlit as st
import feedparser
import urllib
import cloudpickle as cp
import pickle
from urllib.request import urlopen
from summa import summarizer
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
from scipy import ndimage
from langchain_openai import AzureOpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_openai import AzureChatOpenAI
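
# Configure Azure OpenAI access from Streamlit secrets
# (set in .streamlit/secrets.toml or the Space's secrets).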
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
embeddings = AzureOpenAIEmbeddings(
    deployment="embedding",
    model="text-embedding-ada-002",
    azure_endpoint=st.secrets["endpoint1"],
)

llm = AzureChatOpenAI(
    deployment_name="gpt4_small",
    openai_api_version="2023-12-01-preview",
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    temperature=0.0,
)
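
# NOTE: `embeddings` and `llm` are initialized here but are not used by the
# hotspot plot in this section; the plot only needs the precomputed 2D embedding.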
def get_feeds_data(url):
    # data = cp.load(urlopen(url))
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded data")
    return data
# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
dateval = "27-Jun-2023"
feeds_link = "local_files/astro_ph_ga_feeds_upto_" + dateval + ".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_" + dateval + ".pkl"

gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
def get_embedding_data(url):
    # data = cp.load(urlopen(url))
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded embedding data")
    return data

# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = "local_files/astro_ph_ga_embedding_" + dateval + ".pkl"
e2d = get_embedding_data(url)
# e2d, _, _, _, _ = get_embedding_data(url)
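
# A possible refinement (not in the original app): Streamlit reruns the whole
# script on every widget interaction, so the pickles above are reloaded each
# time. Wrapping the loaders in st.cache_data would load each file once per
# session. A minimal sketch, assuming the same local-pickle layout:
#
#     @st.cache_data
#     def load_pickle(path):
#         with open(path, "rb") as fp:
#             return pickle.load(fp)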
ctr = -1
num_chunks = len(gal_feeds)
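
# Flatten the chunked RSS feeds into parallel lists of abstract text and
# per-paper metadata (title, arXiv id, link, authors, publication date,
# and age in days).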
all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []

for nc in range(num_chunks):
    for i in range(len(gal_feeds[nc].entries)):
        text = gal_feeds[nc].entries[i].summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(gal_feeds[nc].entries[i].title)
        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
        all_links.append(gal_feeds[nc].entries[i].links[1].href)
        all_authors.append(gal_feeds[nc].entries[i].authors)
        temp = gal_feeds[nc].entries[i].published
        datetime_object = datetime.datetime.strptime(temp[0:10] + ' ' + temp[11:-1], '%Y-%m-%d %H:%M:%S')
        all_pubdates.append(datetime_object)
        all_old.append((datetime.datetime.now() - datetime_object).days)
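
# The hotspot map differences two 2D histograms of the embedded papers: one for
# papers within `tolage` years of `midage` years ago, and one for the reference
# set (all papers, or only older ones when onlyolder=True). The difference is
# smoothed with a Gaussian filter and shown on a diverging colormap, so red
# regions mark topics with an excess of activity in the selected window.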
def make_time_excess_plot(midage=0, tolage=1, onlyolder=False):
    bw = 0.05
    sigma = 4.0
    mask = (np.abs(np.array(all_old) - midage * 365) < tolage * 365)

    if onlyolder:
        # Reference distribution: only papers older than the selected window.
        mask2 = (np.array(all_old) > midage * 365 + tolage * 365 / 2)
        a = np.histogram2d(e2d[:, 0][mask2], e2d[:, 1][mask2], bins=np.arange(0, 17, bw), density=True)
    else:
        # Reference distribution: all papers.
        a = np.histogram2d(e2d[:, 0], e2d[:, 1], bins=np.arange(0, 17, bw), density=True)
    # Histogram of the selected time window, computed in both cases.
    b = np.histogram2d(e2d[:, 0][mask], e2d[:, 1][mask], bins=np.arange(0, 17, bw), density=True)

    temp = b[0].T - a[0].T
    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')
    vscale = (np.nanpercentile(temp, 99.5) - np.nanpercentile(temp, 0.5)) / 2

    fig = plt.figure(figsize=(11, 9))
    plt.pcolor(a[1][0:-1] + (a[1][1] - a[1][0]) / 2, a[2][0:-1] + (a[2][1] - a[2][0]) / 2,
               temp, cmap='bwr', vmin=-vscale, vmax=vscale)
    plt.colorbar()
    # plt.scatter(e2d[:, 0], e2d[:, 1], s=2, color='k', alpha=0.1)
    plt.title('excess research over the last %.1f yrs centered at %.1f yrs' % (tolage, midage))
    plt.axis([0, 14, 1, 15])
    plt.axis('off')
    st.pyplot(fig)
    return
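
# Streamlit UI: sliders pick the time window, then two hotspot maps are drawn,
# one against the full manifold and one against only older research.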
st.title('Research hotspots')
st.markdown('[Includes papers up to: `' + dateval + '`]')

midage = st.slider('Age', 0., 10., 0.)
tolage = st.slider('Period width', 0., 10., 1.)

st.markdown('Compare the research in a given time period to the full manifold.')
make_time_excess_plot(midage, tolage, onlyolder=False)

st.markdown('Compare the research in a given time period to research older than that.')
make_time_excess_plot(midage, tolage, onlyolder=True)