Spaces:
Sleeping
Sleeping
File size: 12,803 Bytes
da0018b a280e4d 749a763 da0018b a280e4d 749a763 45a7d81 0c14e18 749a763 a280e4d 3a3acc2 a280e4d 3a3acc2 da0018b 6a2ae7a da0018b 437c715 da0018b 7c5594b da0018b 6a2ae7a 45a7d81 d54eee9 6a2ae7a 45a7d81 6a2ae7a 3a3acc2 e54b3e0 6a2ae7a aac3522 3a3acc2 5c7c2df a30e3b1 5c7c2df a30e3b1 5c7c2df 3a3acc2 6a2ae7a 3a3acc2 aac3522 5c7c2df aac3522 5c7c2df aac3522 3a3acc2 6a2ae7a aac3522 6a2ae7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
import json
import pandas as pd
import os
# from google.cloud import secretmanager
import openai
import pinecone
import psycopg2
from psycopg2 import extras
import streamlit as st
# gcp_client = secretmanager.SecretManagerServiceClient()
# response = gcp_client.access_secret_version(request={"name": version.name})
def get_variable(name):
    """Look up a configuration value by name.

    The process environment is consulted first. When that lookup yields
    nothing usable (unset or empty), ``st.secrets`` is tried instead.

    Args:
        name: Key of the environment variable / secret to read.

    Returns:
        The value from the environment if it is truthy, otherwise the value
        stored under ``name`` in ``st.secrets``. If the secrets lookup fails
        for any reason, the (falsy) environment result is returned unchanged.
    """
    value = os.getenv(name)
    if value:
        return value
    try:
        return st.secrets[name]
    except Exception:
        # Best effort on purpose: any failure of the secrets lookup simply
        # means "not configured" and must not break the caller.
        return value
# Credentials and service settings, resolved once at import time through
# get_variable(); each is falsy when the setting is not configured.
OPENAI_API_KEY = get_variable("OPENAI_API_KEY") # OpenAI API key
OPENAI_ORGANIZATION_ID = get_variable("OPENAI_ORGANIZATION_ID")
PINECONE_KEY = get_variable("PINECONE_API_KEY") # app.pinecone.io
PINE_CONE_ENVIRONMENT = get_variable("PINE_CONE_ENVIRONMENT") # app.pinecone.io
# OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"] # app.pinecone.io
# OPENAI_ORGANIZATION_ID = st.secrets["OPENAI_ORGANIZATION_ID"]
# PINECONE_KEY = st.secrets["PINECONE_API_KEY"] # app.pinecone.io
# PINE_CONE_ENVIRONMENT = st.secrets["PINE_CONE_ENVIRONMENT"] # app.pinecone.io
# def create_connection():
# host = st.secrets["RAIZED_DB_HOST"]
# user = st.secrets["RAIZED_DB_USER"]
# pswd = st.secrets["RAIZED_DB_PASSWORD"]
# dbname = 'raized-central'
# return psycopg2.connect(
# database=dbname,
# user=user,
# password=pswd,
# host=host,
# port=5432
# )
###
## Pinecone
@st.cache_resource
def init_pinecone():
    """Initialise the Pinecone client and return a handle to the company index.

    Uses the module-level ``PINECONE_KEY`` and ``PINE_CONE_ENVIRONMENT``
    settings. The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``pinecone.Index`` for the "company-description" index.
    """
    # get a free api key from app.pinecone.io
    pinecone.init(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
    # The name previously read "dompany-description", which disagreed with the
    # "company-description" index that search_index() reconnects to on failure.
    # NOTE(review): confirm the index name against the Pinecone project.
    return pinecone.Index("company-description")
def _build_metadata_filter(regions, countries):
    """Combine the region/country selections into one Pinecone metadata filter."""
    clauses = []
    if regions:
        clauses.append({'region': {"$in": regions}})
    if countries:
        clauses.append({'country': {"$in": countries}})
    if not clauses:
        return {}
    if len(clauses) == 1:
        return clauses[0]
    return {"$and": clauses}


def index_query(xq, top_k, regions=None, countries=None, index_namespace="websummarized"):
    """Query the Pinecone index held in ``st.session_state.index``.

    Args:
        xq: Query vector(s), passed to the index unchanged.
        top_k: Maximum number of matches to request.
        regions: Optional list of region values; when non-empty, matches are
            restricted to records whose ``region`` metadata is in the list.
        countries: Optional list of country values; when non-empty, matches
            are restricted to records whose ``country`` metadata is in the list.
        index_namespace: Pinecone namespace to search.

    Returns:
        The response object returned by ``index.query``.
    """
    # None defaults avoid shared mutable default arguments; normalise to lists
    # so the log line and filter construction behave as with the old defaults.
    regions = [] if regions is None else regions
    countries = [] if countries is None else countries
    print(f"Getting companies from countries: {countries} ")
    query_filter = _build_metadata_filter(regions, countries)
    # Honour the caller's top_k (it used to be ignored in favour of a fixed 20).
    return st.session_state.index.query(
        xq,
        namespace=index_namespace,
        top_k=top_k,
        filter=query_filter,
        include_metadata=True,
        include_vectors=False,
    )
def _parse_summary(metadata):
    """Return the structured form of ``metadata['summary']``, or None if absent."""
    if 'summary' not in metadata:
        return None
    summary = metadata['summary']
    try:
        payload = summary
        # Summaries may arrive wrapped in a ```json ... ``` fence; strip it.
        if payload.startswith('```json') and payload.endswith('```'):
            payload = payload[len('```json'):-len('```')]
        return json.loads(payload)
    except (AttributeError, TypeError, ValueError):
        # Not a string, or not valid JSON: expose the raw summary instead.
        return {"Summary": summary}


def _match_to_answer(match):
    """Convert one Pinecone match into the result dict used by the app."""
    metadata = match['metadata']
    match_id = match['id']
    company_id = match_id
    # Vector ids may carry a type suffix; the result id is the bare prefix.
    for suffix in ("_description", "_webcontent"):
        if match_id.endswith(suffix):
            company_id = match_id[:-len(suffix)]
            break
    return {
        'score': match['score'],
        'metadata': metadata,
        'id': company_id,
        'name': metadata['company_name'],
        'description': metadata['description'] if 'description' in metadata else "",
        'data': _parse_summary(metadata),
    }


def search_index(query, top_k, regions, countries, retriever, index_namespace):
    """Encode ``query``, search the index and return simplified results.

    If the first query attempt fails, the Pinecone connection is
    re-initialised, ``st.session_state.index`` is replaced, and the query is
    retried once.

    Args:
        query: User query text.
        top_k: Number of matches to request, forwarded to ``index_query``.
        regions: Region values to filter on (may be empty).
        countries: Country values to filter on (may be empty).
        retriever: Object whose ``encode([query])`` result provides ``tolist()``.
        index_namespace: Pinecone namespace to search.

    Returns:
        A list of dicts with the keys ``score``, ``metadata``, ``id``,
        ``name``, ``description`` and ``data``. ``data`` is the JSON-decoded
        summary, ``{"Summary": <raw summary>}`` when the summary is not valid
        JSON, or ``None`` when the match has no summary.

    Raises:
        Exception: Whatever the retried query raises if it fails as well.
        KeyError: If a match lacks ``company_name`` metadata.
    """
    xq = retriever.encode([query]).tolist()
    try:
        # Pass the namespace on the first attempt too (it used to be dropped,
        # so only the retry searched the requested namespace).
        xc = index_query(xq, top_k, regions, countries, index_namespace)
    except Exception as err:
        # force reload
        print(f"Index query failed ({err!r}); reconnecting to Pinecone and retrying")
        pinecone.init(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
        st.session_state.index = pinecone.Index("company-description")
        xc = index_query(xq, top_k, regions, countries, index_namespace)
    return [_match_to_answer(match) for match in xc['matches']]
def check_password():
    """Returns `True` if the user had the correct password.

    When the ``SKIP_PASSWORD`` setting equals ``"True"`` the check is bypassed.
    Otherwise a password input is rendered until the entered value matches
    the ``password`` setting; the outcome is kept in
    ``st.session_state["password_correct"]``.

    Returns:
        True once the correct password has been entered (or the check is
        skipped), False while the input is still being shown.
    """
    # Local import keeps this block self-contained.
    import hmac

    if get_variable('SKIP_PASSWORD') == "True":
        return True

    def password_entered():
        """Checks whether a password entered by the user is correct."""
        entered = st.session_state["password"]
        expected = get_variable("password")
        # Constant-time comparison; an unset or non-string configured password
        # never matches (same outcome as the previous == comparison).
        is_match = (
            isinstance(entered, str)
            and isinstance(expected, str)
            and hmac.compare_digest(entered.encode("utf-8"), expected.encode("utf-8"))
        )
        if is_match:
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    if st.session_state.get("password_correct", False):
        # Password correct.
        return True

    # First run or wrong password: show the input again.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        # A previous attempt was made and it was wrong.
        st.error("😕 Password incorrect")
    return False
def get_prompt(title):
    """Return the stored prompt for ``title``.

    Currently a stub: it ignores ``title`` and always returns an empty
    string. The database-backed lookup is kept below, commented out.
    """
    return ""
    # with create_connection() as con:
    # cur = con.cursor(cursor_factory=extras.NamedTupleCursor)
    # cmd = '''
    # SELECT prompt
    # FROM public.prompts
    # WHERE title = %(title)s
    # '''
    # cur.execute(cmd, {'title':title})
    # res = cur.fetchone()
    # # Create a DataFrame from the results and column names
    # print(f"Results getting {title}")
    # return res
# Instruction text for an assistant conversation: present a startup list,
# ask the user to pick entries, suggest further qualification criteria and
# output the chosen filter/ranking criteria as JSON.
# NOTE(review): no use of this constant is visible in this section.
assistant_instructions = """Start like this:
Please find here a list of startups that match the criteria you gave me (right now make a list up, later we will retrieve the list in a step before this).
I like you to present a list view with the option to open up a more detailed view per startup including the location of the startup, the founders and the founding year.
Ask the user to select startups that are of interest for them (just indicate the numbers).
Also invite users to think of other criteria that could help them qualify the startups further such as
1) founder and team characteristics:
- serial entrepreneurs in the team
- strong tech capabilities in the team
- female founders or younger / older founders in the team
- founders who graduated from top 100 universities
ask the user if they would like to use those criteria for filtering (with the downside of seeing potentially very few startups) or rather apply it for ranking the companies (with the downside that there will be a lot of companies at the bottom of the list that are not a match at all).
Invite users to name other criteria even if we are currently not able to provide such features. Ideally, they are possible to extract from a company's website or public founder profiles on social media.
Output a json that specifies the filter criteria important to a user with the output variable.
Also name the ranking criteria and suggest how to combine them to best meet the user's preferences.
"""
# default_prompt = """
# summarize the outcome of this search. The context is a list of company names followed by the company's description and a relevance score to the user query.
# the report should mention the most important companies and how they compare to each other and contain the following sections:
# 1) Title: query text (summarized if more than 20 tokens)
# 2) Best matches: Naming of the 3 companies from the list that are most similar to the search query:
# - summarize what they are doing
# - name customers and technology if they are mentioned
# - compare them to each other and point out what they do differently or what is their unique selling proposition
# ----"""
# Default report prompt: asks for query-improvement advice terminated by a
# '-----' token, then either follows the user's instructions or produces a
# company comparison report.
# NOTE(review): the requested separator token has five dashes while the
# string itself ends with four ("----") - confirm whether that is intended.
default_prompt = """
You are an assistant and your job is to help the user discover and analyze startups companies. You need to create a report with an analysis of companies relevant to the user's query.
Use only information from the explicit list of companies provided! Don't teach the user about investments and don't provide general information.
Below is the user query followed by a list of company descriptions that match the user query.
First advise the user on ways to improve the query followed by the token '-----'
If the user provide instructions, follow them to create a response. If not, create the following report:
The report should mention the most important companies and how they compare to each other and contain the following sections
- summarize what those companies they are doing (up to 20 words per company)
- name customers and technology if they are mentioned
- compare the companies to each other and point out what they do differently or what is their unique selling proposition
----"""
# Prompt asking the model to turn the user's query into a list of retrieval
# keywords, falling back to the original query when unsure.
query_finetune_prompt = """
You are an assistant and your job is to help the user discover and analyze startups companies.
Below is a User Query. The user is looking for companies that match. You first need to understand what type of startups the user is looking for based on that query.
Respond with a list of query keywords that will be used to retreive companies that are relevant to the user. If you are not sure, just respond with the user's original query.
"""
# Prompt for a summary report over the retrieved company descriptions,
# restricted to the companies explicitly provided.
summarization_prompt = """
Below is the user query followed by a list of company descriptions that match the user query. Your job is to create a summary report that will help the user find relevant companies.
Use only information from the explicit list of companies provided!
If you don't have enough information in the user query, advise the user on how to improve the query.
Don't teach the user about investments and don't provide general information.
The report should mention the most important companies and how they compare to each other and contain the following sections
- summarize what those companies they are doing
- name customers and technology if they are mentioned
- compare the companies to each other and point out what they do differently or what is their unique selling proposition
"""
# Prompt asking the model to group the retrieved companies into at most five
# clusters and to suggest how the search could be refined.
# NOTE(review): the outline at the top uses H2/H3 headings while the detailed
# instructions further down use H1/H2 for the same sections - confirm which
# heading levels are intended.
clustering_prompt = """Please create a document with the following headings:
H2: Recap of your question
H2: Clusters of relevant companies
H3: Name of Cluster 1
* List of the companies
H3: Name of Cluster 2
* List of the companies
H3: Name of Cluster x
* List of the companies
H2: How you could improve your search
As an input you will get
- a list of 20 startup companies
- the original user query that was used to retrieve those companies via semantic search out of our database.
Detailed instructions for creating the chapters:
H1: Recap of your question
“I understand that you wanted to find companies in the area of (rephrase the “user query”). I researched our startup database to identify matching startup companies. Here is what I found. Happy to help you refine the search - see some suggestions at the end of the document.”
H1: Clusters of relevant companies
Create clusters of the companies presented by grouping companies together using three main criteria:
- Prio 1: solve the same problem
- Prio 2: target similar customers
- Prio 3: have the same business model (B2B, B2C, eCommerce & Marketplace, Manufacturing, SaaS, Advertising, Commission, Subscription)
The output should be no more than 5 clusters with the following conditions:
- All companies should be assigned to a cluster.
- Each company should only be part of one cluster and not show up in multiple clusters.
The format of the output should be:
H2: Name of Cluster in bold
One sentence that summarizes what the cluster is about.
List with all the companies in this cluster. Each list item should be structured like this:
* name of the company in bold (URL of the company, country location of the company): short summary summary of what the company does (max 30 tokens)
H1: How you could improve your search
“I hope you have already found some interesting matches. I am happy to let you refine your search. Here are some ideas on how to find matches in relation to your original question around (“user query”):”
* List of ideas on how to refine and improve the search"""
def on_prompt_selected():
    """Load the prompt chosen in the ``advanced_prompts_select`` widget.

    Reads the selected title from ``st.session_state``, looks it up with
    ``get_prompt`` and, when a non-empty result comes back, copies the title
    into ``prompt_title_editable`` and the first element of the result into
    ``advanced_prompt_content``. Otherwise only a log line is printed.
    """
    title = st.session_state.advanced_prompts_select
    new_prompt = get_prompt(title)
    # Truthiness instead of len(): a lookup that yields None (no result) is
    # treated as "no results" rather than raising TypeError.
    if new_prompt and new_prompt[0]:
        content = new_prompt[0]
        print(f"Got a prompt for title {title}\n {content}")
        st.session_state.prompt_title_editable = title
        st.session_state.advanced_prompt_content = content
    else:
        print(f"No results for title {title}")
|