from gpt_index import GPTListIndex, SimpleWebPageReader, BeautifulSoupWebReader, GPTSimpleVectorIndex,LLMPredictor from IPython.display import Markdown, display from langchain.agents import load_tools, Tool, initialize_agent from langchain.llms import OpenAI from langchain.agents import ZeroShotAgent, Tool, AgentExecutor from langchain.agents import initialize_agent, Tool from langchain import LLMChain from langchain import PromptTemplate import gradio as gr import pandas as pd import openai from sklearn.manifold import TSNE from sklearn.cluster import KMeans from openai.embeddings_utils import get_embedding import numpy as np import matplotlib.pyplot as plt import matplotlib import datetime from datetime import datetime, date, time, timedelta with open('lastradartext.txt', 'r') as file: data_old = file.read() value1,value2,value3,value4,value5,value6=data_old.split('SEPERATOR') def getstuff(openapikey): mainlistofanswers=[] for each in ['www.mckinsey.com','www.bcg.com','www.bain.com','www.accenture.com']: print(each) Input_URL = "https://"+each documents = SimpleWebPageReader(html_to_text=True).load_data([Input_URL]) index = GPTSimpleVectorIndex(documents) print('Came here 0') #@title # Creating your Langchain Agent def querying_db(query: str): response = index.query(query) return response tools = [ Tool( name = "QueryingDB", func=querying_db, description="This function takes a query string as input and returns the most relevant answer from the documentation as output" )] llm = OpenAI(temperature=0,openai_api_key=openapikey) print('Came here 1') query_string = "what are the top technologies mentioned?" agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True) result = agent.run(query_string) mainlistofanswers.append(result) print('Came here 2') newlistoftech=[] newlistofcompanies=[] for i in range(len(mainlistofanswers)): each=mainlistofanswers[i] each=each.replace("The top technologies mentioned are ","").replace("The technologies mentioned are ","") each=each.replace(":","").replace(" and "," ").replace("and "," ").replace(" and"," ").replace(" the "," ").replace("the "," ").replace(" the"," ").strip() for item in each.split(","): newlistoftech.append(item.strip()) newlistofcompanies.append(i) tech_df=pd.DataFrame() tech_df['tech']=newlistoftech tech_df['company']=newlistofcompanies print('Came here 3') embedding_model = "text-embedding-ada-002" embedding_encoding = "cl100k_base" # this the encoding for text-embedding-ada-002 max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191 tech_df["embedding"] = tech_df['tech'].apply(lambda x: get_embedding(x, engine=embedding_model)) print('Came here 4') dateforfilesave=datetime.today().strftime("%d-%m-%Y") # Load the embeddings # Convert to a list of lists of floats matrix = np.array(tech_df['embedding'].to_list()) # Create a t-SNE model and transform the data tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200) vis_dims = tsne.fit_transform(matrix) n_clusters = 5 kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42) kmeans.fit(matrix) labels = kmeans.labels_ tech_df["Cluster"] = labels print('Came here 5') colors = ["red", "darkorange", "darkgrey", "blue", "darkgreen"] x = [x for x,y in vis_dims] y = [y for x,y in vis_dims] color_indices = tech_df['Cluster'].values colormap = matplotlib.colors.ListedColormap(colors) #plt.scatter(x, y, c=color_indices, cmap=colormap, alpha=0.3,) fig, ax = plt.subplots(figsize=(12,8)) ax.scatter(x, y, c=color_indices, cmap=colormap, alpha=1, s=100) for i, txt in enumerate(tech_df['tech'].tolist()): ax.annotate(txt, (x[i], y[i]),fontsize=14) plt.title("Top Technologies as of "+dateforfilesave,fontsize=20) plt.axis('off') plt.savefig('lasttechradar.png', bbox_inches='tight') print('Came here 6') response = openai.Completion.create( engine="text-davinci-003", prompt=f'I will give you top technologies list. Write a paragraph on it.\n\nTechnologies:'+",".join(tech_df['tech'].tolist()), temperature=0, max_tokens=1024, top_p=1, frequency_penalty=0, presence_penalty=0, ) print(response["choices"][0]["text"].replace("\n", "")) desc_tmp=response["choices"][0]["text"].replace("\n", "") print('Came here 7') # Reading a review which belong to each group. rev_per_cluster = 5 clusterstextlist=[] for i in range(n_clusters): print(f"Cluster {i} Theme:", end=" ") reviews = "\n".join(tech_df[tech_df['Cluster'] == i]['tech'].tolist()) response = openai.Completion.create( engine="text-davinci-003", prompt=f'What do the following technologies have in common?\n\nCustomer reviews:\n"""\n{reviews}\n"""\n\nTheme:', temperature=0, max_tokens=64, top_p=1, frequency_penalty=0, presence_penalty=0, ) print(response["choices"][0]["text"].replace("\n", "")) print(reviews) clusterstextlist.append("Cluster "+str(i)+"\nTheme:"+response["choices"][0]["text"].replace("\n", "")+'\n'+reviews+'\n'+"-" * 10+'\n\n') textlist=[mainlistofanswers[0],"SEPERATOR",mainlistofanswers[1],"SEPERATOR",mainlistofanswers[2],"SEPERATOR",mainlistofanswers[3],"SEPERATOR",desc_tmp,"SEPERATOR","".join(clusterstextlist)] with open('lastradartext.txt', 'w') as f: for line in textlist: f.write(f"{line}\n") print('Came here 8') with open('lastradartext.txt', 'r') as file: data_old = file.read() value1,value2,value3,value4,value5,value6=data_old.split('SEPERATOR') return 'lasttechradar.png',mainlistofanswers[0],mainlistofanswers[1],mainlistofanswers[2],mainlistofanswers[3],desc_tmp,"".join(clusterstextlist) with gr.Blocks() as demo: gr.Markdown("