Spaces:

hanchraizedai
/

semsearch

Sleeping

File size: 12,803 Bytes

import json

import pandas as pd
import os

# from google.cloud import secretmanager
import openai
import pinecone
import psycopg2
from psycopg2 import extras
import streamlit as st

# gcp_client = secretmanager.SecretManagerServiceClient()
# response = gcp_client.access_secret_version(request={"name": version.name})

def get_variable(name):
    """Look up a configuration value by name.

    The process environment is consulted first. When it yields nothing
    (variable unset or empty), ``st.secrets`` is tried as a fallback.

    Args:
        name: Key of the environment variable / Streamlit secret.

    Returns:
        The environment value when it is non-empty, otherwise the Streamlit
        secret. If reading the secret raises, the (falsy) environment result
        is returned unchanged, i.e. ``None`` when the variable is unset.
    """
    env_value = os.getenv(name)
    if env_value:
        return env_value
    try:
        return st.secrets[name]
    except Exception:
        # Best-effort lookup: any failure while reading secrets falls back to
        # whatever the environment returned.
        return env_value

# OpenAI credentials, resolved through get_variable (environment first, then
# Streamlit secrets).
OPENAI_API_KEY = get_variable("OPENAI_API_KEY")
OPENAI_ORGANIZATION_ID = get_variable("OPENAI_ORGANIZATION_ID")

# Pinecone credentials passed to pinecone.init(); keys come from app.pinecone.io.
PINECONE_KEY = get_variable("PINECONE_API_KEY")  # app.pinecone.io
PINE_CONE_ENVIRONMENT = get_variable("PINE_CONE_ENVIRONMENT")  # app.pinecone.io

# OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]  # app.pinecone.io
# OPENAI_ORGANIZATION_ID = st.secrets["OPENAI_ORGANIZATION_ID"]

# PINECONE_KEY = st.secrets["PINECONE_API_KEY"]  # app.pinecone.io
# PINE_CONE_ENVIRONMENT = st.secrets["PINE_CONE_ENVIRONMENT"]  # app.pinecone.io


# def create_connection():
#     host = st.secrets["RAIZED_DB_HOST"]
#     user = st.secrets["RAIZED_DB_USER"]
#     pswd = st.secrets["RAIZED_DB_PASSWORD"]   
#     dbname = 'raized-central'
#     return psycopg2.connect(
#             database=dbname,
#             user=user,
#             password=pswd,
#             host=host,
#             port=5432
#         )    

###

## Pinecone

@st.cache_resource
def init_pinecone():
    """Initialise the Pinecone client and return the company index handle.

    Wrapped in ``st.cache_resource``; uses the module-level ``PINECONE_KEY``
    and ``PINE_CONE_ENVIRONMENT`` credentials.

    Returns:
        A ``pinecone.Index`` bound to the "company-description" index.
    """
    # get a free api key from app.pinecone.io
    pinecone.init(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
    # The name must match the index that search_index reconnects to after a
    # failed query; it was previously misspelled "dompany-description".
    return pinecone.Index("company-description")

   
def index_query(xq, top_k, regions=None, countries=None, index_namespace="websummarized"):
    """Query the Pinecone index stored in ``st.session_state.index``.

    Args:
        xq: Query vector(s); forwarded to ``Index.query`` unchanged.
        top_k: Maximum number of matches to request. ``None`` falls back to
            20, the value that used to be hard-coded here.
        regions: Optional list; when non-empty, only matches whose ``region``
            metadata is in the list are returned.
        countries: Optional list; when non-empty, only matches whose
            ``country`` metadata is in the list are returned.
        index_namespace: Pinecone namespace to search.

    Returns:
        The response object of ``Index.query`` (metadata requested, vectors
        not requested).
    """
    # None instead of [] as default avoids sharing one list across calls.
    regions = [] if regions is None else regions
    countries = [] if countries is None else countries
    print(f"Getting companies from countries: {countries} ")

    conditions = []
    if len(regions) > 0:
        conditions.append({'region': {"$in": regions}})
    if len(countries) > 0:
        conditions.append({'country': {"$in": countries}})

    # A single condition is sent as-is; several are combined with $and.
    if len(conditions) > 1:
        metadata_filter = {"$and": conditions}
    elif conditions:
        metadata_filter = conditions[0]
    else:
        metadata_filter = {}

    # Honour the caller's top_k (it was previously ignored in favour of 20).
    limit = 20 if top_k is None else top_k
    return st.session_state.index.query(
        xq,
        namespace=index_namespace,
        top_k=limit,
        filter=metadata_filter,
        include_metadata=True,
        include_vectors=False,
    )

def search_index(query, top_k, regions, countries, retriever, index_namespace):
    """Encode ``query``, search the Pinecone index and shape the matches.

    Args:
        query: Free-text user query.
        top_k: Number of matches to request; forwarded to ``index_query``.
        regions: Region values to filter on (may be empty).
        countries: Country values to filter on (may be empty).
        retriever: Object whose ``encode([query])`` result has ``tolist()``.
        index_namespace: Pinecone namespace to search.

    Returns:
        A list of dicts with the keys ``score``, ``metadata``, ``id``,
        ``name``, ``description`` and ``data``, one per match.
    """
    xq = retriever.encode([query]).tolist()
    try:
        # Pass the namespace on the first attempt too; it used to be sent
        # only on the retry, so the first query always hit the default one.
        xc = index_query(xq, top_k, regions, countries, index_namespace)
    except Exception:
        # force reload: re-initialise the client, rebuild the index handle in
        # the session state, and retry once. A second failure propagates.
        pinecone.init(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
        st.session_state.index = pinecone.Index("company-description")
        xc = index_query(xq, top_k, regions, countries, index_namespace)

    return [_match_to_answer(match) for match in xc['matches']]


def _strip_id_suffix(match_id):
    """Return ``match_id`` without a trailing "_description"/"_webcontent"."""
    for suffix in ("_description", "_webcontent"):
        if match_id.endswith(suffix):
            return match_id[:-len(suffix)]
    return match_id


def _parse_summary(metadata):
    """Return the parsed ``summary`` metadata, or ``None`` when it is absent.

    A summary that is valid JSON (optionally wrapped in a ```json fence) is
    returned decoded; anything else is wrapped as ``{"Summary": <raw value>}``.
    """
    if 'summary' not in metadata:
        return None
    summary = metadata['summary']
    try:
        text = summary
        if text.startswith('```json') and text.endswith('```'):
            text = text[7:-3]  # drop the leading ```json and trailing ```
        return json.loads(text)
    except (AttributeError, TypeError, ValueError):
        # Not a string or not valid JSON: keep the raw summary for display.
        return {"Summary": summary}


def _match_to_answer(match):
    """Convert one Pinecone match into the result dict used by the caller."""
    metadata = match['metadata']
    return {
        'score': match['score'],
        'metadata': metadata,
        'id': _strip_id_suffix(match['id']),
        'name': metadata['company_name'],
        'description': metadata['description'] if 'description' in metadata else "",
        'data': _parse_summary(metadata),
    }


def check_password():
    """Returns `True` if the user had the correct password.

    The gate is skipped entirely when the ``SKIP_PASSWORD`` variable equals
    the string "True". Otherwise a password input is rendered, and the result
    of the last check is kept in ``st.session_state["password_correct"]``.

    Returns:
        ``True`` when access is granted, ``False`` while the password prompt
        (and, after a failed attempt, an error message) is being shown.
    """
    import hmac  # local import keeps this block self-contained

    if get_variable('SKIP_PASSWORD') == "True":
        return True

    def password_entered():
        """Checks whether a password entered by the user is correct."""
        entered = st.session_state["password"]
        expected = get_variable("password")
        # compare_digest avoids leaking match position through timing. It
        # needs two bytes objects (str input must be ASCII-only), so both
        # sides are encoded. A missing or non-string configured password
        # never matches.
        is_match = (
            isinstance(entered, str)
            and isinstance(expected, str)
            and hmac.compare_digest(entered.encode("utf-8"), expected.encode("utf-8"))
        )
        st.session_state["password_correct"] = is_match
        if is_match:
            del st.session_state["password"]  # don't store password

    if st.session_state.get("password_correct", False):
        # Password correct.
        return True

    # First run or wrong password: show the input again.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        # A previous attempt was checked and rejected.
        st.error("😕 Password incorrect")
    return False

def get_prompt(title):
    """Return the stored prompt for ``title``.

    Currently a stub: the database lookup is disabled (see the commented-out
    code that follows), so an empty string is returned for every title.
    """
    no_prompt = ""
    return no_prompt

    # with create_connection() as con:
    #     cur = con.cursor(cursor_factory=extras.NamedTupleCursor)    
    #     cmd = '''
    #     SELECT prompt
    #     FROM public.prompts 
    #     WHERE title = %(title)s
    #     '''
    #     cur.execute(cmd, {'title':title})
    #     res = cur.fetchone()
    #     # Create a DataFrame from the results and column names
    # print(f"Results getting {title}")
    # return res


# Instruction text for an assistant that presents a list of startups, asks the
# user to pick some, and collects filtering/ranking criteria as JSON.
# Not referenced in the visible part of this file.
assistant_instructions = """Start like this: 
Please find here a list of startups that match the criteria you gave me (right now make a list up, later we will retrieve the list in a step before this).

I like you to present a list view with the option to open up a more detailed view per startup including the location of the startup, the founders and the founding year.

Ask the user to select startups that are of interest for them (just indicate the numbers).

Also invite users to think of other criteria that could help them qualify the startups further such as
1) founder and team characteristics:
- serial entrepreneurs in the team
- strong tech capabilities in the team
- female founders or younger / older founders in the team
- founders who graduated from top 100 universities

ask the user if they would like to use those criteria for filtering (with the downside of seeing potentially very few startups) or rather apply it for ranking the companies (with the downside that there will be a lot of companies at the bottom of the list that are not a match at all).

Invite users to name other criteria even if we are currently not able to provide such features. Ideally, they are possible to extract from a company's website or public founder profiles on social media.

Output a json that specifies the filter criteria important to a user with the output variable. 
Also name the ranking criteria and suggest how to combine them to best meet the user's preferences.
"""

# default_prompt = """
# summarize the outcome of this search. The context is a list of company names followed by the company's description and a relevance score to the user query. 
# the report should mention the most important companies and how they compare to each other and contain the following sections:
# 1) Title: query text (summarized if more than 20 tokens)
# 2) Best matches: Naming of the 3 companies from the list that are most similar to the search query:
# - summarize what they are doing
# - name customers and technology if they are mentioned
# - compare them to each other and point out what they do differently or what is their unique selling proposition
# ----"""    

# Default report prompt: asks for query-improvement advice, then a separator
# token, then a comparative report on the listed companies.
# NOTE(review): the text names the separator '-----' (five dashes) while the
# prompt itself ends with '----' (four) -- confirm which one consumers expect.
default_prompt = """
You are an assistant and your job is to help the user discover and analyze startups companies. You need to create a report with an analysis of companies relevant to the user's query. 
Use only information from the explicit list of companies provided! Don't teach the user about investments and don't provide general information.
Below is the user query followed by a list of company descriptions that match the user query. 
First advise the user on ways to improve the query followed by the token '-----'
If the user provide instructions, follow them to create a response. If not, create the following report:
The report should mention the most important companies and how they compare to each other and contain the following sections
- summarize what those companies they are doing (up to 20 words per company)
- name customers and technology if they are mentioned
- compare the companies to each other and point out what they do differently or what is their unique selling proposition
----"""    

# Prompt that asks the model to turn a raw user query into retrieval keywords,
# falling back to the original query when unsure.
query_finetune_prompt = """
You are an assistant and your job is to help the user discover and analyze startups companies. 
Below is a User Query. The user is looking for companies that match. You first need to understand what type of startups the user is looking for based on that query. 
Respond with a list of query keywords that will be used to retreive companies that are relevant to the user. If you are not sure, just respond with the user's original query.
"""

# Prompt for a summary report over the retrieved company descriptions; the
# model is told to use only the provided list.
summarization_prompt = """
Below is the user query followed by a list of company descriptions that match the user query. Your job is to create a summary report that will help the user find relevant companies.
Use only information from the explicit list of companies provided!
If you don't have enough information in the user query, advise the user on how to improve the query. 
Don't teach the user about investments and don't provide general information.
The report should mention the most important companies and how they compare to each other and contain the following sections
- summarize what those companies they are doing
- name customers and technology if they are mentioned
- compare the companies to each other and point out what they do differently or what is their unique selling proposition
"""

# Prompt that asks the model to group the retrieved companies into at most
# five clusters and to suggest search refinements.
# NOTE(review): the outline at the top uses H2/H3 headings while the detailed
# instructions below refer to the same sections as H1/H2 -- confirm which
# heading levels are intended.
clustering_prompt = """Please create a document with the following headings: 
H2: Recap of your question
H2: Clusters of relevant companies
H3: Name of Cluster 1
* List of the companies
H3: Name of Cluster 2
* List of the companies
H3: Name of Cluster x
* List of the companies
H2: How you could improve your search
As an input you will get 
- a list of 20 startup companies
- the original user query that was used to retrieve those companies via semantic search out of our database.
Detailed instructions for creating the chapters:
H1: Recap of your question
“I understand that you wanted to find companies in the area of (rephrase the “user query”). I researched our startup database to identify matching startup companies. Here is what I found. Happy to help you refine the search - see some suggestions at the end of the document.” 
H1: Clusters of relevant companies
Create clusters of the companies presented by grouping companies together using three main criteria: 
- Prio 1: solve the same problem
- Prio 2: target similar customers
- Prio 3: have the same business model (B2B, B2C, eCommerce & Marketplace, Manufacturing, SaaS, Advertising, Commission, Subscription)
The output should be no more than 5 clusters with the following conditions:
- All companies should be assigned to a cluster.
- Each company should only be part of one cluster and not show up in multiple clusters.
The format of the output should be:
H2: Name of Cluster in bold
One sentence that summarizes what the cluster is about.
List with all the companies in this cluster. Each list item should be structured like this:
* name of the company in bold (URL of the company, country location of the company): short summary summary of what the company does (max 30 tokens)
H1: How you could improve your search
“I hope you have already found some interesting matches. I am happy to let you refine your search. Here are some ideas on how to find matches in relation to your original question around (“user query”):”
* List of ideas on how to refine and improve the search"""




def on_prompt_selected():
    """Load the prompt chosen in the ``advanced_prompts_select`` widget.

    Reads the selected title from the session state and fetches it with
    ``get_prompt``. When a non-empty first element comes back, the title is
    copied into ``prompt_title_editable`` and the prompt text into
    ``advanced_prompt_content``. Otherwise only a message is printed.
    """
    selected_title = st.session_state.advanced_prompts_select
    prompt_row = get_prompt(selected_title)

    has_prompt = len(prompt_row) > 0 and len(prompt_row[0]) > 0
    if not has_prompt:
        print(f"No results for title {st.session_state.advanced_prompts_select}")
        return

    prompt_text = prompt_row[0]
    print(f"Got a prompt for title {selected_title}\n {prompt_text}")
    st.session_state.prompt_title_editable = st.session_state.advanced_prompts_select
    st.session_state.advanced_prompt_content = prompt_text