Commit 9a105fd
Keane Moraes committed
Parent(s): 10296ed
speeding up the clustering
app.py CHANGED
@@ -34,7 +34,7 @@ if file1 is not None and file2 is not None:
     with st.spinner("Flux capacitor is fluxing..."):
         embedder = utils.load_model()
         clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2, num_clusters=5)
-        print(clutered)
+        # print(clutered)
 
     with st.spinner("Polishing up"):
         results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
@@ -43,5 +43,5 @@ if file1 is not None and file2 is not None:
     st.title("Insights generated")
 
     for result in results:
-        with st.expander(
-        st.write(result
+        with st.expander("See explanation"):
+            st.write(result)
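The second hunk is the likely source of the Space's build error: the old `with st.expander(` and `st.write(result` lines were incomplete statements, so Python could not even parse app.py. A minimal runnable sketch of the corrected loop (the "See explanation" label comes from this diff; the sample `results` list is illustrative):

import streamlit as st

# stand-in for utils.generate_insights(...); illustrative values only
results = ["Insight one.", "Insight two."]

st.title("Insights generated")

for result in results:
    # each insight collapses into its own expander panel
    with st.expander("See explanation"):
        st.write(result)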
utils.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
 from keybert import KeyBERT
 from nltk.corpus import stopwords
 from transformers import AutoTokenizer
-import os, re
+import os, re, json
 import openai
 import spacy
 from sklearn.cluster import KMeans, AgglomerativeClustering
@@ -84,10 +84,10 @@ def cluster_based_on_topics(embedder, text1:str, text2:str, num_clusters=3):
     all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
 
     # Normalize the embeddings to unit length
-    all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
+    # all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
 
     # Perform agglomerative clustering
-    clustering_model =
+    clustering_model = KMeans(n_clusters=num_clusters)
     clustering_model.fit(all_embeddings)
     cluster_assignment = clustering_model.labels_
 
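This hunk is the commit message in action: the removed line (truncated in this view) built the model the surrounding comment describes as agglomerative, and the replacement is `KMeans(n_clusters=num_clusters)`. Agglomerative clustering works from pairwise distances over all sentences, which grows roughly quadratically with corpus size, while k-means only compares each point to `num_clusters` centroids per iteration. One side effect: with the unit-length normalization commented out, clustering now runs on raw embeddings under Euclidean distance rather than approximating cosine distance. A hedged sketch of the trade-off, with illustrative shapes:

import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering

# Illustrative stand-in for the real sentence embeddings (n_sentences, dim).
rng = np.random.default_rng(0)
all_embeddings = rng.normal(size=(2000, 384)).astype(np.float32)
num_clusters = 5

# Before (per the comment in the diff): hierarchy built from pairwise
# distances, expensive in both time and memory as n grows.
agg = AgglomerativeClustering(n_clusters=num_clusters).fit(all_embeddings)

# After: k-means compares each point only to the k centroids per iteration.
km = KMeans(n_clusters=num_clusters, n_init=10).fit(all_embeddings)

# The dropped normalization step; without it, Euclidean distance on raw
# vectors replaces the cosine-like geometry the old code set up.
unit = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)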
@@ -106,7 +106,7 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
 
     PROMPT = open("insights.prompt", "r").read()
 
-    print(topics)
+    # print(topics)
 
     PROMPT = PROMPT.replace("{{name1}}", name1)
     PROMPT = PROMPT.replace("{{name2}}", name2)
@@ -121,11 +121,11 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
 
     for cluster_id, sentences in clusters.items():
 
-        print(cluster_id, " ", sentences)
+        # print(cluster_id, " ", sentences)
         final_prompt = PROMPT.replace("{{sentences}}", "\n".join(sentences))
 
-
-
+        with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
+            f.write(final_prompt)
 
         # Generate insights for each cluster
         response = openai.Completion.create(
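The two added lines persist each cluster's filled-in prompt for inspection. One caveat worth flagging: `open(f"prompter/insights_{cluster_id}.prompt", "w")` raises FileNotFoundError if the `prompter/` directory is missing from the checkout. A hedged hardening sketch (the directory creation is an addition of this note, not part of the diff):

import os

# Hypothetical hardening: ensure the directory exists before writing.
os.makedirs("prompter", exist_ok=True)  # not in the diff; added here

cluster_id = 0                     # illustrative
final_prompt = "rendered prompt"   # stands in for the templated PROMPT

with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
    f.write(final_prompt)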
@@ -139,17 +139,19 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
         )
 
         text = response['choices'][0]['text']
-        name_location = text.find("Name:")
-        description_location = text.find("Description:")
-        name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
-        description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
-        final_insights.append({"name": name_of_insight, "description": description})
+        # name_location = text.find("Name:")
 
-        #
-        #
+        # description_location = text.find("Description:")
+        # name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
+        # print(name_of_insight)
+        # description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
+        # print(description)
+        # final_insights.append({"name": name_of_insight, "description": description})
 
-
-
+        with open(f"prompter/insights_{cluster_id}.prompt", "a") as f:
+            f.write(text)
+
+        final_insights.append(text)
     return final_insights
 
 
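The last hunk retires the structured parsing: instead of slicing `Name:` and `Description:` fields out of the completion (the `+6`/`+13` offsets are the lengths of "Name: " and "Description: "), the function now appends the raw completion text to `final_insights` and logs it to the same per-cluster prompt file. For reference, a sketch of what the commented-out logic did, rewritten with a regex rather than index arithmetic (the sample `text` is illustrative):

import re

# Illustrative completion in the format the old parser expected.
text = "Name: Shared themes\nDescription: Both documents discuss clustering.\n"

name_match = re.search(r"Name:\s*(.+)", text)
desc_match = re.search(r"Description:\s*(.+)", text)

insight = {
    "name": name_match.group(1).strip() if name_match else "",
    "description": desc_match.group(1).strip() if desc_match else "",
}
# -> {'name': 'Shared themes', 'description': 'Both documents discuss clustering.'}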