File size: 12,803 Bytes
da0018b
 
a280e4d
749a763
 
 
 
da0018b
a280e4d
 
 
749a763
 
 
 
 
45a7d81
 
 
 
 
 
0c14e18
749a763
 
 
 
 
 
 
 
 
 
 
 
 
a280e4d
3a3acc2
 
 
 
 
 
 
 
 
 
 
 
a280e4d
3a3acc2
da0018b
 
 
 
 
 
 
 
6a2ae7a
da0018b
437c715
da0018b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c5594b
 
 
 
da0018b
 
 
 
 
 
 
 
6a2ae7a
 
 
45a7d81
d54eee9
 
6a2ae7a
 
45a7d81
6a2ae7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a3acc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e54b3e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a2ae7a
 
 
 
 
 
 
 
 
aac3522
3a3acc2
5c7c2df
a30e3b1
5c7c2df
 
a30e3b1
 
5c7c2df
3a3acc2
6a2ae7a
3a3acc2
 
aac3522
 
5c7c2df
 
aac3522
 
 
5c7c2df
 
 
 
 
 
 
 
aac3522
 
3a3acc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a2ae7a
 
 
 
 
 
 
aac3522
6a2ae7a
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
import json

import pandas as pd
import os

# from google.cloud import secretmanager
import openai
import pinecone
import psycopg2
from psycopg2 import extras
import streamlit as st

# gcp_client = secretmanager.SecretManagerServiceClient()
# response = gcp_client.access_secret_version(request={"name": version.name})

def get_variable(name):
    """Look up a configuration value by name.

    The process environment is consulted first. When the variable is unset
    or empty, ``st.secrets[name]`` is tried instead. If that lookup fails
    for any reason, the (falsy) environment result is returned unchanged.
    """
    from_env = os.getenv(name)
    if from_env:
        return from_env
    try:
        return st.secrets[name]
    except Exception:
        # Best-effort fallback: a missing secret is not an error here.
        return from_env

# OpenAI credentials, resolved via get_variable (environment first, then Streamlit secrets).
OPENAI_API_KEY = get_variable("OPENAI_API_KEY")
OPENAI_ORGANIZATION_ID = get_variable("OPENAI_ORGANIZATION_ID")

# Pinecone credentials, resolved the same way; the values come from app.pinecone.io.
PINECONE_KEY = get_variable("PINECONE_API_KEY")  # app.pinecone.io
PINE_CONE_ENVIRONMENT = get_variable("PINE_CONE_ENVIRONMENT")  # app.pinecone.io

# OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]  # app.pinecone.io
# OPENAI_ORGANIZATION_ID = st.secrets["OPENAI_ORGANIZATION_ID"]

# PINECONE_KEY = st.secrets["PINECONE_API_KEY"]  # app.pinecone.io
# PINE_CONE_ENVIRONMENT = st.secrets["PINE_CONE_ENVIRONMENT"]  # app.pinecone.io


# def create_connection():
#     host = st.secrets["RAIZED_DB_HOST"]
#     user = st.secrets["RAIZED_DB_USER"]
#     pswd = st.secrets["RAIZED_DB_PASSWORD"]   
#     dbname = 'raized-central'
#     return psycopg2.connect(
#             database=dbname,
#             user=user,
#             password=pswd,
#             host=host,
#             port=5432
#         )    

###

## Pinecone

@st.cache_resource
def init_pinecone():
    """Initialise the Pinecone client and return a handle to the company index.

    The result is cached by ``st.cache_resource``. Credentials come from the
    module-level ``PINECONE_KEY`` / ``PINE_CONE_ENVIRONMENT`` values.

    Returns:
        A ``pinecone.Index`` bound to the ``company-description`` index.
    """
    pinecone.init(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)  # get a free api key from app.pinecone.io
    # Index name aligned with the one search_index() reconnects to; the
    # previous "dompany-description" spelling looked like a typo.
    # NOTE(review): confirm against the index name shown in the Pinecone console.
    return pinecone.Index("company-description")

   
def index_query(xq, top_k, regions=None, countries=None, index_namespace="websummarized"):
    """Query the Pinecone index stored in ``st.session_state.index``.

    Args:
        xq: Query vector(s), passed through unchanged to ``index.query``.
        top_k: Maximum number of matches to request. Previously this argument
            was ignored and 20 was always requested.
        regions: Optional list of region values. When non-empty, only matches
            whose ``region`` metadata is in the list are returned.
        countries: Optional list of country values, applied the same way to
            the ``country`` metadata field.
        index_namespace: Namespace to search within the index.

    Returns:
        The response of ``index.query`` (metadata included, vectors excluded).
    """
    # None defaults avoid sharing one mutable list object between calls.
    regions = [] if regions is None else regions
    countries = [] if countries is None else countries

    print(f"Getting companies from countries: {countries} ")

    clauses = []
    if len(regions) > 0:
        clauses.append({'region': {"$in": regions}})
    if len(countries) > 0:
        clauses.append({'country': {"$in": countries}})

    # A single clause is sent as-is; several are combined with $and.
    if len(clauses) > 1:
        metadata_filter = {"$and": clauses}
    elif clauses:
        metadata_filter = clauses[0]
    else:
        metadata_filter = {}

    return st.session_state.index.query(
        xq,
        namespace=index_namespace,
        top_k=top_k,
        filter=metadata_filter,
        include_metadata=True,
        include_vectors=False,
    )

def search_index(query, top_k, regions, countries, retriever, index_namespace):
    """Encode ``query``, search the Pinecone index and normalise the matches.

    Args:
        query: Free-text user query.
        top_k: Number of matches to request (forwarded to ``index_query``).
        regions: Region values to filter on (may be empty).
        countries: Country values to filter on (may be empty).
        retriever: Object whose ``encode([query])`` result has ``.tolist()``.
        index_namespace: Pinecone namespace to search.

    Returns:
        A list of dicts with the keys ``score``, ``metadata``, ``id``,
        ``name``, ``description`` and ``data``. ``data`` is the summary parsed
        as JSON when possible, ``{"Summary": <raw summary>}`` when it is not
        valid JSON, and ``None`` when the match has no summary.
    """
    xq = retriever.encode([query]).tolist()
    try:
        # Pass the namespace on the first attempt too, so both attempts
        # search the same namespace.
        xc = index_query(xq, top_k, regions, countries, index_namespace)
    except Exception:
        # force reload: rebuild the index handle and retry once
        pinecone.init(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
        st.session_state.index = pinecone.Index("company-description")
        xc = index_query(xq, top_k, regions, countries, index_namespace)

    return [_match_to_answer(match) for match in xc['matches']]


def _strip_id_suffix(match_id):
    """Return ``match_id`` without a trailing ``_description``/``_webcontent``."""
    for suffix in ("_description", "_webcontent"):
        if match_id.endswith(suffix):
            return match_id[:-len(suffix)]
    return match_id


def _parse_summary(metadata):
    """Return the structured form of ``metadata['summary']`` (see search_index)."""
    if 'summary' not in metadata:
        return None
    summary = metadata['summary']
    fallback = {"Summary": summary}
    if not isinstance(summary, str):
        return fallback
    text = summary
    # Summaries may arrive wrapped in a ```json ... ``` markdown fence.
    if text.startswith('```json') and text.endswith('```'):
        text = text[7:-3]
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        # Not valid JSON: keep the raw text rather than failing the search.
        return fallback


def _match_to_answer(match):
    """Convert one Pinecone match into the result dict returned by search_index."""
    metadata = match['metadata']
    answer = {'score': match['score'], 'metadata': metadata}
    answer['id'] = _strip_id_suffix(match['id'])
    answer["name"] = metadata['company_name']
    answer["description"] = metadata['description'] if "description" in metadata else ""
    answer['data'] = _parse_summary(metadata)
    return answer


def check_password():
    """Gate the app behind a password prompt.

    Returns ``True`` when the ``SKIP_PASSWORD`` variable equals ``"True"`` or
    when a correct password has already been recorded in the session state.
    Otherwise the password input is rendered (with an error message after a
    failed attempt) and ``False`` is returned.
    """
    if get_variable('SKIP_PASSWORD') == "True":
        return True

    def password_entered():
        """Widget callback: record whether the submitted password matches."""
        matches = st.session_state["password"] == get_variable("password")
        if matches:
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    already_attempted = "password_correct" in st.session_state
    if already_attempted and st.session_state["password_correct"]:
        # Password correct.
        return True

    # First run or wrong password: show the input either way.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if already_attempted:
        st.error("😕 Password incorrect")
    return False

def get_prompt(title) -> str:
    """Return the stored prompt for ``title``.

    Currently a stub: the database lookup is disabled (see the commented-out
    code below), so ``title`` is ignored and an empty string is returned.
    """
    return ""

    # with create_connection() as con:
    #     cur = con.cursor(cursor_factory=extras.NamedTupleCursor)    
    #     cmd = '''
    #     SELECT prompt
    #     FROM public.prompts 
    #     WHERE title = %(title)s
    #     '''
    #     cur.execute(cmd, {'title':title})
    #     res = cur.fetchone()
    #     # Create a DataFrame from the results and column names
    # print(f"Results getting {title}")
    # return res


# Instruction text for a conversational assistant: present a startup list
# (made up for now, per the text itself), let the user pick entries, suggest
# extra qualification criteria, and output the chosen filter and ranking
# criteria as JSON.
assistant_instructions = """Start like this: 
Please find here a list of startups that match the criteria you gave me (right now make a list up, later we will retrieve the list in a step before this).

I like you to present a list view with the option to open up a more detailed view per startup including the location of the startup, the founders and the founding year.

Ask the user to select startups that are of interest for them (just indicate the numbers).

Also invite users to think of other criteria that could help them qualify the startups further such as
1) founder and team characteristics:
- serial entrepreneurs in the team
- strong tech capabilities in the team
- female founders or younger / older founders in the team
- founders who graduated from top 100 universities

ask the user if they would like to use those criteria for filtering (with the downside of seeing potentially very few startups) or rather apply it for ranking the companies (with the downside that there will be a lot of companies at the bottom of the list that are not a match at all).

Invite users to name other criteria even if we are currently not able to provide such features. Ideally, they are possible to extract from a company's website or public founder profiles on social media.

Output a json that specifies the filter criteria important to a user with the output variable. 
Also name the ranking criteria and suggest how to combine them to best meet the user's preferences.
"""

# default_prompt = """
# summarize the outcome of this search. The context is a list of company names followed by the company's description and a relevance score to the user query. 
# the report should mention the most important companies and how they compare to each other and contain the following sections:
# 1) Title: query text (summarized if more than 20 tokens)
# 2) Best matches: Naming of the 3 companies from the list that are most similar to the search query:
# - summarize what they are doing
# - name customers and technology if they are mentioned
# - compare them to each other and point out what they do differently or what is their unique selling proposition
# ----"""    

# Default report prompt: asks for query-improvement advice first, then the
# '-----' separator token, then either the user's own instructions or a
# comparison report built only from the supplied company list.
default_prompt = """
You are an assistant and your job is to help the user discover and analyze startups companies. You need to create a report with an analysis of companies relevant to the user's query. 
Use only information from the explicit list of companies provided! Don't teach the user about investments and don't provide general information.
Below is the user query followed by a list of company descriptions that match the user query. 
First advise the user on ways to improve the query followed by the token '-----'
If the user provide instructions, follow them to create a response. If not, create the following report:
The report should mention the most important companies and how they compare to each other and contain the following sections
- summarize what those companies they are doing (up to 20 words per company)
- name customers and technology if they are mentioned
- compare the companies to each other and point out what they do differently or what is their unique selling proposition
----"""    

# Prompt that asks the model to turn the user's query into retrieval keywords,
# falling back to the original query when it is unsure.
query_finetune_prompt = """
You are an assistant and your job is to help the user discover and analyze startups companies. 
Below is a User Query. The user is looking for companies that match. You first need to understand what type of startups the user is looking for based on that query. 
Respond with a list of query keywords that will be used to retreive companies that are relevant to the user. If you are not sure, just respond with the user's original query.
"""

# Prompt for a summary report over the retrieved companies: restricted to the
# supplied list, with query-improvement advice when the query is too thin.
summarization_prompt = """
Below is the user query followed by a list of company descriptions that match the user query. Your job is to create a summary report that will help the user find relevant companies.
Use only information from the explicit list of companies provided!
If you don't have enough information in the user query, advise the user on how to improve the query. 
Don't teach the user about investments and don't provide general information.
The report should mention the most important companies and how they compare to each other and contain the following sections
- summarize what those companies they are doing
- name customers and technology if they are mentioned
- compare the companies to each other and point out what they do differently or what is their unique selling proposition
"""

# Prompt that asks for a three-part document: a recap of the question, up to
# five clusters of the retrieved companies (each company in exactly one
# cluster), and suggestions for refining the search.
# NOTE(review): the outline at the top uses H2/H3 while the detailed
# instructions below use H1/H2 for the same sections — confirm which is intended.
clustering_prompt = """Please create a document with the following headings: 
H2: Recap of your question
H2: Clusters of relevant companies
H3: Name of Cluster 1
* List of the companies
H3: Name of Cluster 2
* List of the companies
H3: Name of Cluster x
* List of the companies
H2: How you could improve your search
As an input you will get 
- a list of 20 startup companies
- the original user query that was used to retrieve those companies via semantic search out of our database.
Detailed instructions for creating the chapters:
H1: Recap of your question
“I understand that you wanted to find companies in the area of (rephrase the “user query”). I researched our startup database to identify matching startup companies. Here is what I found. Happy to help you refine the search - see some suggestions at the end of the document.” 
H1: Clusters of relevant companies
Create clusters of the companies presented by grouping companies together using three main criteria: 
- Prio 1: solve the same problem
- Prio 2: target similar customers
- Prio 3: have the same business model (B2B, B2C, eCommerce & Marketplace, Manufacturing, SaaS, Advertising, Commission, Subscription)
The output should be no more than 5 clusters with the following conditions:
- All companies should be assigned to a cluster.
- Each company should only be part of one cluster and not show up in multiple clusters.
The format of the output should be:
H2: Name of Cluster in bold
One sentence that summarizes what the cluster is about.
List with all the companies in this cluster. Each list item should be structured like this:
* name of the company in bold (URL of the company, country location of the company): short summary summary of what the company does (max 30 tokens)
H1: How you could improve your search
“I hope you have already found some interesting matches. I am happy to let you refine your search. Here are some ideas on how to find matches in relation to your original question around (“user query”):”
* List of ideas on how to refine and improve the search"""




def on_prompt_selected():
    """Callback that loads the prompt named by ``advanced_prompts_select``.

    Reads the selected title from the session state and looks it up with
    ``get_prompt``. When a non-empty prompt comes back, the title is copied
    to ``prompt_title_editable`` and the prompt's first element to
    ``advanced_prompt_content``. Otherwise only a log line is printed.
    """
    title = st.session_state.advanced_prompts_select
    new_prompt = get_prompt(title)
    # Truthiness rather than len(): a lookup that yields None (no stored
    # prompt) must take the "no results" branch instead of raising TypeError.
    if new_prompt and new_prompt[0]:
        print(f"Got a prompt for title {title}\n {new_prompt[0]}")
        st.session_state.prompt_title_editable = title
        st.session_state.advanced_prompt_content = new_prompt[0]
    else:
        print(f"No results for title {title}")