Keane Moraes committed
Commit 9a105fd · 1 Parent(s): 10296ed

speeding up the clustering

Files changed (2):
  1. app.py +3 -3
  2. utils.py +18 -16
app.py CHANGED
@@ -34,7 +34,7 @@ if file1 is not None and file2 is not None:
     with st.spinner("Flux capacitor is fluxing..."):
         embedder = utils.load_model()
         clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2, num_clusters=5)
-        print(clutered)
+        # print(clutered)
 
     with st.spinner("Polishing up"):
         results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
@@ -43,5 +43,5 @@ if file1 is not None and file2 is not None:
     st.title("Insights generated")
 
     for result in results:
-        with st.expander(result["name"]):
-            st.write(result["description"])
+        with st.expander("See explanation"):
+            st.write(result)
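Note on the second hunk: generate_insights now returns raw completion strings instead of {"name": ..., "description": ...} dicts, so the render loop writes each string under a fixed "See explanation" label and every expander looks identical. A minimal sketch of one way to keep the labels distinct (not part of this commit; the results list here is placeholder data):

import streamlit as st

results = ["First insight...", "Second insight..."]  # placeholder for generate_insights() output

# Numbering the expanders keeps repeated labels distinguishable in the UI
for i, result in enumerate(results, start=1):
    with st.expander(f"Insight {i}"):
        st.write(result)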
utils.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
 from keybert import KeyBERT
 from nltk.corpus import stopwords
 from transformers import AutoTokenizer
-import os, re
+import os, re, json
 import openai
 import spacy
 from sklearn.cluster import KMeans, AgglomerativeClustering
@@ -84,10 +84,10 @@ def cluster_based_on_topics(embedder, text1:str, text2:str, num_clusters=3):
     all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
 
     # Normalize the embeddings to unit length
-    all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
+    # all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
 
     # Perform agglomerative clustering
-    clustering_model = AgglomerativeClustering(n_clusters=num_clusters)
+    clustering_model = KMeans(n_clusters=num_clusters)
     clustering_model.fit(all_embeddings)
     cluster_assignment = clustering_model.labels_
 
@@ -106,7 +106,7 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
 
     PROMPT = open("insights.prompt", "r").read()
 
-    print(topics)
+    # print(topics)
 
     PROMPT = PROMPT.replace("{{name1}}", name1)
     PROMPT = PROMPT.replace("{{name2}}", name2)
@@ -121,11 +121,11 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
 
     for cluster_id, sentences in clusters.items():
 
-        print(cluster_id, " ", sentences)
+        # print(cluster_id, " ", sentences)
         final_prompt = PROMPT.replace("{{sentences}}", "\n".join(sentences))
 
-        # with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
-        #     f.write(final_prompt)
+        with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
+            f.write(final_prompt)
 
         # Generate insights for each cluster
         response = openai.Completion.create(
@@ -139,17 +139,19 @@ def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, c
         )
 
         text = response['choices'][0]['text']
-        name_location = text.find("Name:")
-        description_location = text.find("Description:")
-        name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
-        description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
-        final_insights.append({"name": name_of_insight, "description": description})
+        # name_location = text.find("Name:")
+
+        # description_location = text.find("Description:")
+        # name_of_insight = text[name_location+6:name_location+6+text[name_location+6:].find("\n")]
+        # print(name_of_insight)
+        # description = text[:name_location] + text[description_location+13:description_location+13+text[description_location+13:].find("\n")]
+        # print(description)
+        # final_insights.append({"name": name_of_insight, "description": description})
 
-        # with open(f"prompter/insights_{cluster_id}.prompt", "a") as f:
-        #     f.write(text)
+        with open(f"prompter/insights_{cluster_id}.prompt", "a") as f:
+            f.write(text)
 
-        # final_insights.append(text)
-
+        final_insights.append(text)
     return final_insights
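Note on the clustering hunk: the speedup comes from swapping AgglomerativeClustering for KMeans. Agglomerative clustering works from pairwise distances between all sentence embeddings, which grows roughly quadratically with the number of sentences, while k-means cost grows roughly linearly per iteration. A standalone sketch of the before/after (not from the repo; the random vectors and 384-dimensional embedding size are stand-in assumptions):

import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering

rng = np.random.default_rng(0)
all_embeddings = rng.normal(size=(2000, 384)).astype(np.float32)  # stand-in for sentence embeddings

# Before: hierarchical clustering over all pairwise distances (~O(n^2))
agg_labels = AgglomerativeClustering(n_clusters=5).fit(all_embeddings).labels_

# After: k-means, roughly linear in the number of sentences per iteration
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(all_embeddings)
cluster_assignment = kmeans.labels_  # consumed the same way as before

Two side effects of the diff worth flagging: with the unit-normalization line commented out, KMeans groups by raw Euclidean distance rather than the cosine-like distance that unit vectors would give, and the un-commented open(f"prompter/insights_{cluster_id}.prompt", "w") calls assume the prompter/ directory already exists.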