Spaces:
Build error
Build error
Keane Moraes
committed on
Commit
·
9a105fd
1
Parent(s):
10296ed
speeding up the clustering
Browse files
app.py
CHANGED
|
@@ -34,7 +34,7 @@ if file1 is not None and file2 is not None:
|
|
| 34 |
with st.spinner("Flux capacitor is fluxing..."):
|
| 35 |
embedder = utils.load_model()
|
| 36 |
clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2, num_clusters=5)
|
| 37 |
-
print(clutered)
|
| 38 |
|
| 39 |
with st.spinner("Polishing up"):
|
| 40 |
results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
|
|
@@ -43,5 +43,5 @@ if file1 is not None and file2 is not None:
|
|
| 43 |
st.title("Insights generated")
|
| 44 |
|
| 45 |
for result in results:
|
| 46 |
-
with st.expander(
|
| 47 |
-
st.write(result
|
|
|
|
| 34 |
with st.spinner("Flux capacitor is fluxing..."):
|
| 35 |
embedder = utils.load_model()
|
| 36 |
clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2, num_clusters=5)
|
| 37 |
+
# print(clutered)
|
| 38 |
|
| 39 |
with st.spinner("Polishing up"):
|
| 40 |
results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
|
|
|
|
| 43 |
st.title("Insights generated")
|
| 44 |
|
| 45 |
for result in results:
|
| 46 |
+
with st.expander("See explanation"):
|
| 47 |
+
st.write(result)
|
utils.py
CHANGED
|
@@ -2,7 +2,7 @@ import streamlit as st
|
|
| 2 |
from keybert import KeyBERT
|
| 3 |
from nltk.corpus import stopwords
|
| 4 |
from transformers import AutoTokenizer
|
| 5 |
-
import os, re
|
| 6 |
import openai
|
| 7 |
import spacy
|
| 8 |
from sklearn.cluster import KMeans, AgglomerativeClustering
|
|
@@ -84,10 +84,10 @@ def cluster_based_on_topics(embedder, text1:str, text2:str, num_clusters=3):
|
|
| 84 |
all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
|
| 85 |
|
| 86 |
# Normalize the embeddings to unit length
|
| 87 |
-
all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
|
| 88 |
|
| 89 |
# Perform agglomerative clustering
|
| 90 |
-
clustering_model =
|
| 91 |
clustering_model.fit(all_embeddings)
|
| 92 |
cluster_assignment = clustering_model.labels_
|
| 93 |
|
|
@@ -106,7 +106,7 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
|
|
| 106 |
|
| 107 |
PROMPT = open("insights.prompt", "r").read()
|
| 108 |
|
| 109 |
-
print(topics)
|
| 110 |
|
| 111 |
PROMPT = PROMPT.replace("{{name1}}", name1)
|
| 112 |
PROMPT = PROMPT.replace("{{name2}}", name2)
|
|
@@ -121,11 +121,11 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
|
|
| 121 |
|
| 122 |
for cluster_id, sentences in clusters.items():
|
| 123 |
|
| 124 |
-
print(cluster_id, " ", sentences)
|
| 125 |
final_prompt = PROMPT.replace("{{sentences}}", "\n".join(sentences))
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
|
| 130 |
# Generate insights for each cluster
|
| 131 |
response = openai.Completion.create(
|
|
@@ -139,17 +139,19 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
|
|
| 139 |
)
|
| 140 |
|
| 141 |
text = response['choices'][0]['text']
|
| 142 |
-
name_location = text.find("Name:")
|
| 143 |
-
description_location = text.find("Description:")
|
| 144 |
-
name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
|
| 145 |
-
description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
|
| 146 |
-
final_insights.append({"name": name_of_insight, "description": description})
|
| 147 |
|
| 148 |
-
#
|
| 149 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
return final_insights
|
| 154 |
|
| 155 |
|
|
|
|
| 2 |
from keybert import KeyBERT
|
| 3 |
from nltk.corpus import stopwords
|
| 4 |
from transformers import AutoTokenizer
|
| 5 |
+
import os, re, json
|
| 6 |
import openai
|
| 7 |
import spacy
|
| 8 |
from sklearn.cluster import KMeans, AgglomerativeClustering
|
|
|
|
| 84 |
all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
|
| 85 |
|
| 86 |
# Normalize the embeddings to unit length
|
| 87 |
+
# all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
|
| 88 |
|
| 89 |
# Perform agglomerative clustering
|
| 90 |
+
clustering_model = KMeans(n_clusters=num_clusters)
|
| 91 |
clustering_model.fit(all_embeddings)
|
| 92 |
cluster_assignment = clustering_model.labels_
|
| 93 |
|
|
|
|
| 106 |
|
| 107 |
PROMPT = open("insights.prompt", "r").read()
|
| 108 |
|
| 109 |
+
# print(topics)
|
| 110 |
|
| 111 |
PROMPT = PROMPT.replace("{{name1}}", name1)
|
| 112 |
PROMPT = PROMPT.replace("{{name2}}", name2)
|
|
|
|
| 121 |
|
| 122 |
for cluster_id, sentences in clusters.items():
|
| 123 |
|
| 124 |
+
# print(cluster_id, " ", sentences)
|
| 125 |
final_prompt = PROMPT.replace("{{sentences}}", "\n".join(sentences))
|
| 126 |
|
| 127 |
+
with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
|
| 128 |
+
f.write(final_prompt)
|
| 129 |
|
| 130 |
# Generate insights for each cluster
|
| 131 |
response = openai.Completion.create(
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
text = response['choices'][0]['text']
|
| 142 |
+
# name_location = text.find("Name:")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
# description_location = text.find("Description:")
|
| 145 |
+
# name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
|
| 146 |
+
# print(name_of_insight)
|
| 147 |
+
# description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
|
| 148 |
+
# print(description)
|
| 149 |
+
# final_insights.append({"name": name_of_insight, "description": description})
|
| 150 |
+
|
| 151 |
+
with open(f"prompter/insights_{cluster_id}.prompt", "a") as f:
|
| 152 |
+
f.write(text)
|
| 153 |
|
| 154 |
+
final_insights.append(text)
|
|
|
|
| 155 |
return final_insights
|
| 156 |
|
| 157 |
|