Spaces:
Build error
Build error
Keane Moraes
committed on
Commit
·
9a105fd
1
Parent(s):
10296ed
speeding up the clustering
Browse files
app.py
CHANGED
|
@@ -34,7 +34,7 @@ if file1 is not None and file2 is not None:
|
|
| 34 |
with st.spinner("Flux capacitor is fluxing..."):
|
| 35 |
embedder = utils.load_model()
|
| 36 |
clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2, num_clusters=5)
|
| 37 |
-
print(clutered)
|
| 38 |
|
| 39 |
with st.spinner("Polishing up"):
|
| 40 |
results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
|
|
@@ -43,5 +43,5 @@ if file1 is not None and file2 is not None:
|
|
| 43 |
st.title("Insights generated")
|
| 44 |
|
| 45 |
for result in results:
|
| 46 |
-
with st.expander(
|
| 47 |
-
st.write(result
|
|
|
|
| 34 |
with st.spinner("Flux capacitor is fluxing..."):
|
| 35 |
embedder = utils.load_model()
|
| 36 |
clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2, num_clusters=5)
|
| 37 |
+
# print(clutered)
|
| 38 |
|
| 39 |
with st.spinner("Polishing up"):
|
| 40 |
results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
|
|
|
|
| 43 |
st.title("Insights generated")
|
| 44 |
|
| 45 |
for result in results:
|
| 46 |
+
with st.expander("See explanation"):
|
| 47 |
+
st.write(result)
|
utils.py
CHANGED
|
@@ -2,7 +2,7 @@ import streamlit as st
|
|
| 2 |
from keybert import KeyBERT
|
| 3 |
from nltk.corpus import stopwords
|
| 4 |
from transformers import AutoTokenizer
|
| 5 |
-
import os, re
|
| 6 |
import openai
|
| 7 |
import spacy
|
| 8 |
from sklearn.cluster import KMeans, AgglomerativeClustering
|
|
@@ -84,10 +84,10 @@ def cluster_based_on_topics(embedder, text1:str, text2:str, num_clusters=3):
|
|
| 84 |
all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
|
| 85 |
|
| 86 |
# Normalize the embeddings to unit length
|
| 87 |
-
all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
|
| 88 |
|
| 89 |
# Perform agglomerative clustering
|
| 90 |
-
clustering_model =
|
| 91 |
clustering_model.fit(all_embeddings)
|
| 92 |
cluster_assignment = clustering_model.labels_
|
| 93 |
|
|
@@ -106,7 +106,7 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
|
|
| 106 |
|
| 107 |
PROMPT = open("insights.prompt", "r").read()
|
| 108 |
|
| 109 |
-
print(topics)
|
| 110 |
|
| 111 |
PROMPT = PROMPT.replace("{{name1}}", name1)
|
| 112 |
PROMPT = PROMPT.replace("{{name2}}", name2)
|
|
@@ -121,11 +121,11 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
|
|
| 121 |
|
| 122 |
for cluster_id, sentences in clusters.items():
|
| 123 |
|
| 124 |
-
print(cluster_id, " ", sentences)
|
| 125 |
final_prompt = PROMPT.replace("{{sentences}}", "\n".join(sentences))
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
|
| 130 |
# Generate insights for each cluster
|
| 131 |
response = openai.Completion.create(
|
|
@@ -139,17 +139,19 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
|
|
| 139 |
)
|
| 140 |
|
| 141 |
text = response['choices'][0]['text']
|
| 142 |
-
name_location = text.find("Name:")
|
| 143 |
-
description_location = text.find("Description:")
|
| 144 |
-
name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
|
| 145 |
-
description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
|
| 146 |
-
final_insights.append({"name": name_of_insight, "description": description})
|
| 147 |
|
| 148 |
-
#
|
| 149 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
return final_insights
|
| 154 |
|
| 155 |
|
|
|
|
| 2 |
from keybert import KeyBERT
|
| 3 |
from nltk.corpus import stopwords
|
| 4 |
from transformers import AutoTokenizer
|
| 5 |
+
import os, re, json
|
| 6 |
import openai
|
| 7 |
import spacy
|
| 8 |
from sklearn.cluster import KMeans, AgglomerativeClustering
|
|
|
|
| 84 |
all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
|
| 85 |
|
| 86 |
# Normalize the embeddings to unit length
|
| 87 |
+
# all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
|
| 88 |
|
| 89 |
# Perform agglomerative clustering
|
| 90 |
+
clustering_model = KMeans(n_clusters=num_clusters)
|
| 91 |
clustering_model.fit(all_embeddings)
|
| 92 |
cluster_assignment = clustering_model.labels_
|
| 93 |
|
|
|
|
| 106 |
|
| 107 |
PROMPT = open("insights.prompt", "r").read()
|
| 108 |
|
| 109 |
+
# print(topics)
|
| 110 |
|
| 111 |
PROMPT = PROMPT.replace("{{name1}}", name1)
|
| 112 |
PROMPT = PROMPT.replace("{{name2}}", name2)
|
|
|
|
| 121 |
|
| 122 |
for cluster_id, sentences in clusters.items():
|
| 123 |
|
| 124 |
+
# print(cluster_id, " ", sentences)
|
| 125 |
final_prompt = PROMPT.replace("{{sentences}}", "\n".join(sentences))
|
| 126 |
|
| 127 |
+
with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
|
| 128 |
+
f.write(final_prompt)
|
| 129 |
|
| 130 |
# Generate insights for each cluster
|
| 131 |
response = openai.Completion.create(
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
text = response['choices'][0]['text']
|
| 142 |
+
# name_location = text.find("Name:")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
# description_location = text.find("Description:")
|
| 145 |
+
# name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
|
| 146 |
+
# print(name_of_insight)
|
| 147 |
+
# description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
|
| 148 |
+
# print(description)
|
| 149 |
+
# final_insights.append({"name": name_of_insight, "description": description})
|
| 150 |
+
|
| 151 |
+
with open(f"prompter/insights_{cluster_id}.prompt", "a") as f:
|
| 152 |
+
f.write(text)
|
| 153 |
|
| 154 |
+
final_insights.append(text)
|
|
|
|
| 155 |
return final_insights
|
| 156 |
|
| 157 |
|