Spaces:
Running
Running
hanoch.rahimi@gmail
committed on
Commit
·
a280e4d
1
Parent(s):
eab6925
added openai summarization and visual design
Browse files- app.py +21 -11
- requirements.txt +1 -0
- semsearch.pyproj +1 -0
- utils.py +33 -0
app.py
CHANGED
@@ -10,6 +10,7 @@ import streamlit as st
|
|
10 |
from transformers import AutoTokenizer
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
|
|
|
13 |
|
14 |
PINECONE_KEY = st.secrets["PINECONE_API_KEY"] # app.pinecone.io
|
15 |
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"] # app.pinecone.io
|
@@ -48,23 +49,22 @@ retriever, tokenizer = init_models()
|
|
48 |
def card(name, description, score, data_type, region, country):
|
49 |
return st.markdown(f"""
|
50 |
<div class="container-fluid">
|
51 |
-
<div class="row align-items-start">
|
52 |
<div class="col-md-8 col-sm-8">
|
53 |
-
<b>{name}
|
54 |
-
<
|
55 |
-
|
56 |
-
<small>{description}</small>
|
57 |
-
[<b>Score: </b>{score}]
|
58 |
</span>
|
59 |
</div>
|
60 |
<div class="col-md-1 col-sm-1">
|
61 |
-
<
|
62 |
</div>
|
63 |
<div class="col-md-1 col-sm-1">
|
64 |
-
<
|
65 |
</div>
|
66 |
<div class="col-md-1 col-sm-1">
|
67 |
-
<
|
|
|
68 |
</div>
|
69 |
</div>
|
70 |
</div>
|
@@ -126,7 +126,13 @@ def run_query(query, prompt, scrape_boost, top_k , regions, countries):
|
|
126 |
if 'type' in match['metadata'] and match['metadata']['type']=='description-webcontent':
|
127 |
score = score * scrape_boost
|
128 |
answer = {'score': score}
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
answer["description"] = match["metadata"]['description'] if "description" in match['metadata'] else ""
|
131 |
answer["metadata"] = match["metadata"]
|
132 |
results.append(answer)
|
@@ -150,9 +156,13 @@ def run_query(query, prompt, scrape_boost, top_k , regions, countries):
|
|
150 |
|
151 |
sorted_result = sorted(results, key=lambda x: x['score'], reverse=True)
|
152 |
|
|
|
|
|
|
|
|
|
153 |
for r in sorted_result:
|
154 |
company_name = r["name"]
|
155 |
-
description = r["description"]
|
156 |
score = round(r["score"], 4)
|
157 |
data_type = r["metadata"]["type"] if "type" in r["metadata"] else ""
|
158 |
region = r["metadata"]["region"]
|
|
|
10 |
from transformers import AutoTokenizer
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
|
13 |
+
from utils import get_companies_data
|
14 |
|
15 |
PINECONE_KEY = st.secrets["PINECONE_API_KEY"] # app.pinecone.io
|
16 |
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"] # app.pinecone.io
|
|
|
49 |
def card(name, description, score, data_type, region, country):
|
50 |
return st.markdown(f"""
|
51 |
<div class="container-fluid">
|
52 |
+
<div class="row align-items-start" style="padding-bottom:10px;">
|
53 |
<div class="col-md-8 col-sm-8">
|
54 |
+
<b>{name}.</b>
|
55 |
+
<span style="">
|
56 |
+
{description}
|
|
|
|
|
57 |
</span>
|
58 |
</div>
|
59 |
<div class="col-md-1 col-sm-1">
|
60 |
+
<span>{region}</span>
|
61 |
</div>
|
62 |
<div class="col-md-1 col-sm-1">
|
63 |
+
<span>{country}</span>
|
64 |
</div>
|
65 |
<div class="col-md-1 col-sm-1">
|
66 |
+
<span>{data_type}</span>
|
67 |
+
<span>[Score: {score}</span>
|
68 |
</div>
|
69 |
</div>
|
70 |
</div>
|
|
|
126 |
if 'type' in match['metadata'] and match['metadata']['type']=='description-webcontent':
|
127 |
score = score * scrape_boost
|
128 |
answer = {'score': score}
|
129 |
+
if match['id'].endswith("_description"):
|
130 |
+
answer['id'] = match['id'][:-12]
|
131 |
+
elif match['id'].endswith("_webcontent"):
|
132 |
+
answer['id'] = match['id'][:-11]
|
133 |
+
else:
|
134 |
+
answer['id'] = match['id']
|
135 |
+
answer["name"] = match["metadata"]['company_name']
|
136 |
answer["description"] = match["metadata"]['description'] if "description" in match['metadata'] else ""
|
137 |
answer["metadata"] = match["metadata"]
|
138 |
results.append(answer)
|
|
|
156 |
|
157 |
sorted_result = sorted(results, key=lambda x: x['score'], reverse=True)
|
158 |
|
159 |
+
|
160 |
+
st.markdown("<h2>Related companies</h2>", unsafe_allow_html=True)
|
161 |
+
#df = get_companies_data([r['id'] for r in results])
|
162 |
+
|
163 |
for r in sorted_result:
|
164 |
company_name = r["name"]
|
165 |
+
description = r["description"] #.replace(company_name, f"<mark>{company_name}</mark>")
|
166 |
score = round(r["score"], 4)
|
167 |
data_type = r["metadata"]["type"] if "type" in r["metadata"] else ""
|
168 |
region = r["metadata"]["region"]
|
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
langchain
|
2 |
openai
|
3 |
pinecone-client
|
|
|
4 |
sentence_transformers
|
5 |
transformers
|
|
|
1 |
langchain
|
2 |
openai
|
3 |
pinecone-client
|
4 |
+
psycopg2-binary==2.8.6
|
5 |
sentence_transformers
|
6 |
transformers
|
semsearch.pyproj
CHANGED
@@ -35,6 +35,7 @@
|
|
35 |
</ItemGroup>
|
36 |
<ItemGroup>
|
37 |
<Compile Include="app.py" />
|
|
|
38 |
</ItemGroup>
|
39 |
<ItemGroup>
|
40 |
<Folder Include=".streamlit" />
|
|
|
35 |
</ItemGroup>
|
36 |
<ItemGroup>
|
37 |
<Compile Include="app.py" />
|
38 |
+
<Compile Include="utils.py" />
|
39 |
</ItemGroup>
|
40 |
<ItemGroup>
|
41 |
<Folder Include=".streamlit" />
|
utils.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import psycopg2
|
3 |
+
from psycopg2 import extras
|
4 |
+
import streamlit as st
|
5 |
+
|
6 |
+
def create_connection():
    """Open a new psycopg2 connection to the 'raized-central' database.

    Host, user and password are read from Streamlit secrets
    (RAIZED_DB_HOST, RAIZED_DB_USER, RAIZED_DB_PASSWORD); the database
    name and port 5432 are fixed here.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        owns it and is responsible for closing it.
    """
    secrets = st.secrets
    # Secrets are looked up in the same order as before so that a missing
    # key surfaces the same error first.
    db_host = secrets["RAIZED_DB_HOST"]
    db_user = secrets["RAIZED_DB_USER"]
    db_password = secrets["RAIZED_DB_PASSWORD"]

    connect_kwargs = dict(
        database='raized-central',
        user=db_user,
        password=db_password,
        host=db_host,
        port=5432,
    )
    return psycopg2.connect(**connect_kwargs)
|
18 |
+
|
19 |
+
def get_companies_data(company_ids=None):
    """Fetch company records for the given ids as a pandas DataFrame.

    Args:
        company_ids: Iterable of company ids to look up. ``None`` or an
            empty iterable returns an empty DataFrame without opening a
            database connection.

    Returns:
        A DataFrame with one row per matching company. On the empty-input
        path the columns are those listed in ``columns`` below; otherwise
        they are taken from the cursor description of the query.
    """
    columns = [
        "company_id",
        "company_name",
        "description_long",
        "country_name",
        "region",
        "mapped_cat",
        "website_url",
        "next_funding_tag_inv",
    ]

    # Materialise once: accepts any iterable (including generators), and
    # psycopg2 adapts a tuple to a parenthesised SQL list for IN.
    ids = tuple(company_ids) if company_ids is not None else ()
    if not ids:
        # An empty tuple would render as `IN ()`, which PostgreSQL rejects
        # as a syntax error, so short-circuit instead of querying.
        return pd.DataFrame(columns=columns)

    cmd = '''
        SELECT company_id, company_name, description_long, country_name, region, mapped_cat, website_url, next_funding_tag_inv
        FROM central.v_companies_latest_mat_fe
        WHERE company_id in %(company_ids)s
    '''
    params = {"company_ids": ids}

    con = create_connection()
    try:
        # `with con` only scopes the transaction (commit on success,
        # rollback on error); it does not close the connection, hence the
        # explicit close() in `finally`. The cursor context closes the cursor.
        with con, con.cursor(cursor_factory=extras.NamedTupleCursor) as cur:
            cur.execute(cmd, params)
            column_names = [desc[0] for desc in cur.description]
            data = cur.fetchall()
    finally:
        con.close()

    # Create a DataFrame from the results and column names
    return pd.DataFrame(data, columns=column_names)
|