Jolie80 taskswithcode committed on
Commit
cebe313
·
0 Parent(s):

Duplicate from taskswithcode/semantic_clustering

Browse files

Co-authored-by: RA <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ text-similarity-davinci-001imdb_sent_embed.json filter=lfs diff=lfs merge=lfs -text
33
+ text-similarity-davinci-001larger_test_embed.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Semantic Clustering
3
+ emoji: 🏃
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: taskswithcode/semantic_clustering
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import sys
3
+ import streamlit as st
4
+ import string
5
+ from io import StringIO
6
+ import pdb
7
+ import json
8
+ from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
9
+ from twc_openai_embeddings import OpenAIModel
10
+ from twc_clustering import TWCClustering
11
+ import torch
12
+ import requests
13
+ import socket
14
+
15
+
16
+ MAX_INPUT = 100
17
+
18
+ SEM_SIMILARITY="1"
19
+ DOC_RETRIEVAL="2"
20
+ CLUSTERING="3"
21
+
22
+
23
+ use_case = {"1":"Finding similar phrases/sentences","2":"Retrieving semantically matching information to a query. It may not be a factual match","3":"Clustering"}
24
+ use_case_url = {"1":"https://huggingface.co/spaces/taskswithcode/semantic_similarity","2":"https://huggingface.co/spaces/taskswithcode/semantic_search","3":""}
25
+
26
+
27
+
28
+ from transformers import BertTokenizer, BertForMaskedLM
29
+
30
+
31
+ APP_NAME = "hf/semantic_clustering"
32
+ INFO_URL = "https://www.taskswithcode.com/stats/"
33
+
34
+
35
+
36
+
37
+
38
def get_views(action):
    """Report a usage event to the TWC stats endpoint and return the view count.

    On the first call of a session, posts an event and caches the returned
    count in ``st.session_state["view_count"]``; later calls reuse the cached
    count and (for non-"init" actions) post a fire-and-forget event.

    Args:
        action: event name, e.g. "init" or "submit".

    Returns:
        The view count formatted with thousands separators (str).
    """
    ret_val = 0
    hostname = socket.gethostname()
    ip_address = socket.gethostbyname(hostname)
    app_info = {'name': APP_NAME, "action": action, "host": hostname, "ip": ip_address}
    if ("view_count" not in st.session_state):
        try:
            res = requests.post(INFO_URL, json=app_info).json()
            print(res)
            data = res["count"]
        except Exception:
            # Stats are best-effort; fall back to 0 rather than break the page.
            data = 0
        ret_val = data
        st.session_state["view_count"] = data
    else:
        ret_val = st.session_state["view_count"]
        if (action != "init"):
            # Best-effort event ping; the original left this unguarded, so a
            # transient network error crashed the whole page render.
            try:
                requests.post(INFO_URL, json=app_info).json()
            except Exception:
                pass
    return "{:,}".format(ret_val)
58
+
59
+
60
+
61
+
62
def construct_model_info_for_display(model_names):
    """Build the model-picker options and an HTML blurb describing each model.

    Args:
        model_names: list of model-metadata dicts loaded from the models JSON.

    Returns:
        (options_arr, markdown_str): the display names for the selectbox and
        the HTML description rendered below the form.
    """
    option_names = []
    chunks = [f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>The selected models satisfy one or more of the following (1) state-of-the-art (2) the most downloaded models on Hugging Face (3) Large Language Models (e.g. GPT-3)</i></div>"]
    chunks.append("<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>")
    for entry in model_names:
        option_names.append(entry["name"])
        # Only entries explicitly marked are described in the HTML blurb.
        if (entry["mark"] == "True"):
            chunks.append(f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{entry['paper_url']}\' target='_blank'>{entry['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{entry['orig_author_url']}\' target='_blank'>{entry['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{entry['sota_info']['sota_link']}\' target='_blank'>{entry['sota_info']['task']}</a></div>")
            if ("Note" in entry):
                chunks.append(f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{entry['Note']}<a href=\'{entry['alt_url']}\' target='_blank'>link</a></div>")
            chunks.append("<div style=\"font-size:16px; color: #5f5f5f; text-align: left\"><br/></div>")
    chunks.append("<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><b>Note:</b><br/>•&nbsp;Uploaded files are loaded into non-persistent memory for the duration of the computation. They are not cached</div>")
    limit = "{:,}".format(MAX_INPUT)
    chunks.append(f"<div style=\"font-size:12px; color: #9f9f9f; text-align: left\">•&nbsp;User uploaded file has a maximum limit of {limit} sentences.</div>")
    return option_names, "".join(chunks)
78
+
79
+
80
# Page chrome: configure the Streamlit page, then render the TWC logo banner
# in an 85/15 column split (the narrow column is padding).
st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for semantic clustering using sentence embeddings', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
            menu_items={
             'About': 'This app was created by taskswithcode. http://taskswithcode.com'

              })
col,pad = st.columns([85,15])

with col:
    st.image("long_form_logo_with_icon.png")
90
+
91
@st.experimental_memo
def load_model(model_name, model_class, load_model_name):
    """Instantiate and initialize an embedding model, memoized by Streamlit.

    Args:
        model_name: display name (used only for error reporting / cache key).
        model_class: name of a model class resolvable via ``globals()``,
            e.g. "HFModel" or "OpenAIModel".
        load_model_name: checkpoint/model identifier passed to ``init_model``.

    Returns:
        The initialized model instance, or None on failure (the error is
        surfaced in the UI via ``st.error``).
    """
    ret_model = None
    try:
        obj_class = globals()[model_class]
        ret_model = obj_class()
        ret_model.init_model(load_model_name)
        # Explicit check instead of `assert`, which is stripped under -O.
        if ret_model is None:
            raise ValueError("model initialization produced no model")
    except Exception as e:
        st.error(f"Unable to load model class:{model_class} model_name: {model_name} load_model_name: {load_model_name} {str(e)}")
    return ret_model
103
+
104
+
105
+
106
@st.experimental_memo
def cached_compute_similarity(input_file_name,sentences,_model,model_name,threshold,_cluster,clustering_type):
    """Embed the sentences and cluster them, memoized by Streamlit.

    The leading-underscore parameters (_model, _cluster) are excluded from
    Streamlit's memo hashing; model_name/threshold/clustering_type key the cache.
    """
    texts, embeddings = _model.compute_embeddings(input_file_name, sentences, is_file=False)
    return _cluster.cluster(None, texts, embeddings, threshold, clustering_type)
111
+
112
+
113
def uncached_compute_similarity(input_file_name,sentences,_model,model_name,threshold,cluster,clustering_type):
    """Embed the sentences and cluster them without caching (user uploads).

    A spinner is shown while the embeddings are computed; results are
    returned directly so uploaded files never enter the memo cache.
    """
    with st.spinner('Computing vectors for sentences'):
        texts, embeddings = _model.compute_embeddings(input_file_name, sentences, is_file=False)
        clustered = cluster.cluster(None, texts, embeddings, threshold, clustering_type)
    return clustered
119
+
120
DEFAULT_HF_MODEL = "sentence-transformers/paraphrase-MiniLM-L6-v2"

def get_model_info(model_names, model_name):
    """Look up the metadata node for ``model_name``, falling back to the default.

    Args:
        model_names: list of model-metadata dicts (each with a "name" key).
        model_name: display name to look up.

    Returns:
        (node, resolved_name) for the requested model, or for
        DEFAULT_HF_MODEL when the requested name is not listed.

    Raises:
        ValueError: if neither the requested model nor the default is present.
            (The original recursed unconditionally on a miss, so a missing
            default caused infinite recursion.)
    """
    for node in model_names:
        if (model_name == node["name"]):
            return node, model_name
    # Fall back to the default exactly once instead of recursing blindly.
    if model_name != DEFAULT_HF_MODEL:
        return get_model_info(model_names, DEFAULT_HF_MODEL)
    raise ValueError(f"default model {DEFAULT_HF_MODEL} not found in model list")
126
+
127
+
128
def run_test(model_names,model_name,input_file_name,sentences,display_area,threshold,user_uploaded,custom_model,clustering_type):
    """Load the selected model, run clustering, and return the results dict.

    Args:
        model_names: model-metadata list from the models JSON.
        model_name: user-selected (or custom) model display name.
        input_file_name: name of the input file (cache key / labeling only).
        sentences: list of input sentences.
        display_area: st.empty() placeholder used for progress messages.
        threshold: zscore threshold forwarded to the clustering step.
        user_uploaded: True when the text came from a user upload (skips cache).
        custom_model: True when the name came from the free-text model box.
        clustering_type: "overlapped" / "non-overlapped".

    Returns:
        The clustering results dict, {"error": msg} when the model cannot be
        used with uploads, or {} on an unexpected failure (after st.stop()).
    """
    display_area.text("Loading model:" + model_name)
    #Note. model_name may get mapped to new name in the call below for custom models
    orig_model_name = model_name
    model_info,model_name = get_model_info(model_names,model_name)
    if (model_name != orig_model_name):
        # Custom model: the requested name itself is the checkpoint to load.
        load_model_name = orig_model_name
    else:
        load_model_name = model_info["model"]
    if ("Note" in model_info):
        # Surface the model's caveat (e.g. OpenAI API requirement) in the UI.
        fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
        display_area.write(fail_link)
    if (user_uploaded and "custom_load" in model_info and model_info["custom_load"] == "False"):
        # Model cannot embed arbitrary uploads (e.g. requires OpenAI API keys).
        fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
        display_area.write(fail_link)
        return {"error":fail_link}
    model = load_model(model_name,model_info["class"],load_model_name)
    display_area.text("Model " + model_name + " load complete")
    try:
        if (user_uploaded):
            # Uploaded text is never cached (non-persistent by design).
            results = uncached_compute_similarity(input_file_name,sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
        else:
            display_area.text("Computing vectors for sentences")
            results = cached_compute_similarity(input_file_name,sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
        display_area.text("Similarity computation complete")
        return results

    except Exception as e:
        st.error("Some error occurred during prediction" + str(e))
        st.stop()
    return {}
159
+
160
+
161
+
162
+
163
+
164
def display_results(orig_sentences,results,response_info,app_mode,model_name):
    """Render clustering results as HTML and stage the JSON download payload.

    Args:
        orig_sentences: the input sentences, indexed by the 0-based indices
            stored in ``results``.
        results: clustering output with "clusters" (pivot_index, neighs) and
            "info" (mean/std/current_threshold/zscores/overlap) keys.
        response_info: timing summary line shown above the results.
        app_mode: current app mode (unused here beyond the signature).
        model_name: model display name shown in the header.

    Side effects: writes the rendered HTML via st.markdown, stores the
    download JSON in st.session_state["download_ready"], and logs a
    "submit" view event.
    """
    main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
    main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model:&nbsp;<b>{model_name}</b></div>"
    score_text = "cosine distance"
    main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}.&nbsp;<b>{len(results['clusters'])} clusters</b>.&nbsp;&nbsp;mean:{results['info']['mean']:.2f};&nbsp;std:{results['info']['std']:.2f};&nbsp;current threshold:{results['info']['current_threshold']}<br/>Threshold hints:{str(results['info']['zscores'])}<br/>Overlap stats(overlap,freq):{str(results['info']['overlap'])}</div>"
    body_sent = []
    download_data = {}
    for i in range(len(results["clusters"])):
        pivot_index = results["clusters"][i]["pivot_index"]
        pivot_sent = orig_sentences[pivot_index]
        # Shift to 1-based numbering for display and the download payload.
        pivot_index += 1
        d_cluster = {}
        download_data[i + 1] = d_cluster
        d_cluster["pivot"] = {"pivot_index":pivot_index,"sent":pivot_sent,"children":{}}
        body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{pivot_index}]&nbsp;{pivot_sent}&nbsp;<b><i>(Cluster {i+1})</i></b>&nbsp;&nbsp;</div>")
        neighs_dict = results["clusters"][i]["neighs"]
        for key in neighs_dict:
            # Keys are 0-based sentence indices; values are cosine distances
            # to the pivot.
            cosine_dist = neighs_dict[key]
            child_index = key
            sentence = orig_sentences[child_index]
            child_index += 1
            body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{child_index}]&nbsp;{sentence}&nbsp;&nbsp;&nbsp;<b>{cosine_dist:.2f}</b></div>")
            d_cluster["pivot"]["children"][sentence] = f"{cosine_dist:.2f}"
        body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">&nbsp;</div>")
    main_sent = main_sent + "\n" + '\n'.join(body_sent)
    st.markdown(main_sent,unsafe_allow_html=True)
    st.session_state["download_ready"] = json.dumps(download_data,indent=4)
    get_views("submit")
192
+
193
+
194
def init_session():
    """Populate st.session_state defaults on first page load (idempotent).

    Uses "model_name" as the sentinel for "already initialized"; reruns of
    the script skip re-initialization so widget state survives.
    """
    if ("model_name" not in st.session_state):
        # The original assigned "model_name" twice; once is enough.
        st.session_state["model_name"] = "ss_test"
        st.session_state["download_ready"] = None
        st.session_state["threshold"] = 1.5
        st.session_state["file_name"] = "default"
        st.session_state["overlapped"] = "overlapped"
        st.session_state["cluster"] = TWCClustering()
    else:
        print("Skipping init session")
205
+
206
def app_main(app_mode,example_files,model_name_files,clus_types):
    """Top-level Streamlit page: form, model run, and results rendering.

    Args:
        app_mode: use-case key ("1" similarity, "2" retrieval, "3" clustering).
        example_files: path to the JSON mapping example labels -> file names.
        model_name_files: path to the JSON list of model metadata.
        clus_types: path to the JSON mapping clustering labels -> type dicts.
    """
    init_session()
    # Load the page's configuration: example inputs, models, clustering types.
    with open(example_files) as fp:
        example_file_names = json.load(fp)
    with open(model_name_files) as fp:
        model_names = json.load(fp)
    with open(clus_types) as fp:
        cluster_types = json.load(fp)
    curr_use_case = use_case[app_mode].split(".")[0]
    st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for semantic clustering using sentence embeddings</h5>", unsafe_allow_html=True)
    st.markdown(f"<p style='font-size:14px; color: #4f4f4f; text-align: center'><i>Or compare your own model with state-of-the-art/popular models</p>", unsafe_allow_html=True)
    st.markdown(f"<div style='color: #4f4f4f; text-align: left'>Use cases for sentence embeddings<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;<a href=\'{use_case_url['1']}\' target='_blank'>{use_case['1']}</a><br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;<a href=\'{use_case_url['2']}\' target='_blank'>{use_case['2']}</a><br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['3']}<br/><i>This app illustrates <b>'{curr_use_case}'</b> use case</i></div>", unsafe_allow_html=True)
    st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views('init')}</div>", unsafe_allow_html=True)


    try:


        with st.form('twc_form'):

            step1_line = "Upload text file(one sentence in a line) or choose an example text file below"
            if (app_mode == DOC_RETRIEVAL):
                step1_line += ". The first line is treated as the query"
            uploaded_file = st.file_uploader(step1_line, type=".txt")

            selected_file_index = st.selectbox(label=f'Example files ({len(example_file_names)})',
                options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
            st.write("")
            options_arr,markdown_str = construct_model_info_for_display(model_names)
            selection_label = 'Select Model'
            selected_model = st.selectbox(label=selection_label,
                options = options_arr, index=0, key = "twc_model")
            st.write("")
            custom_model_selection = st.text_input("Model not listed above? Type any Hugging Face sentence embedding model name ", "",key="custom_model")
            hf_link_str = "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><a href='https://huggingface.co/models?pipeline_tag=sentence-similarity' target = '_blank'>List of Hugging Face sentence embedding models</a><br/><br/><br/></div>"
            st.markdown(hf_link_str, unsafe_allow_html=True)
            threshold = st.number_input('Choose a zscore threshold (number of std devs from mean)',value=st.session_state["threshold"],min_value = 0.0,step=.01)
            st.write("")
            clustering_type = st.selectbox(label=f'Select type of clustering',
                options = list(dict.keys(cluster_types)), index=0, key = "twc_cluster_types")
            st.write("")
            submit_button = st.form_submit_button('Run')


        input_status_area = st.empty()
        display_area = st.empty()
        if submit_button:
            start = time.time()
            # Prefer the uploaded file; otherwise read the chosen example file.
            if uploaded_file is not None:
                st.session_state["file_name"] = uploaded_file.name
                sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
            else:
                st.session_state["file_name"] = example_file_names[selected_file_index]["name"]
                sentences = open(example_file_names[selected_file_index]["name"]).read()
            # Drop the trailing empty element produced by the final newline.
            sentences = sentences.split("\n")[:-1]
            if (len(sentences) > MAX_INPUT):
                st.info(f"Input sentence count exceeds maximum sentence limit. First {MAX_INPUT} out of {len(sentences)} sentences chosen")
                sentences = sentences[:MAX_INPUT]
            # A non-empty custom model name overrides the selectbox choice.
            if (len(custom_model_selection) != 0):
                run_model = custom_model_selection
            else:
                run_model = selected_model
            st.session_state["model_name"] = selected_model
            st.session_state["threshold"] = threshold
            st.session_state["overlapped"] = cluster_types[clustering_type]["type"]
            results = run_test(model_names,run_model,st.session_state["file_name"],sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0),cluster_types[clustering_type]["type"])
            display_area.empty()
            with display_area.container():
                if ("error" in results):
                    st.error(results["error"])
                else:
                    device = 'GPU' if torch.cuda.is_available() else 'CPU'
                    response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
                    if (len(custom_model_selection) != 0):
                        st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
                    display_results(sentences,results,response_info,app_mode,run_model)
                    #st.json(results)
                st.download_button(
                    label="Download results as json",
                    data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
                    disabled = False if st.session_state["download_ready"] != None else True,
                    file_name= (st.session_state["model_name"] + "_" + str(st.session_state["threshold"]) + "_" + st.session_state["overlapped"] + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
                    mime='text/json',
                    key ="download"
                )



    except Exception as e:
        st.error("Some error occurred during loading" + str(e))
        st.stop()

    st.markdown(markdown_str, unsafe_allow_html=True)
299
+
300
+
301
+
302
if __name__ == "__main__":
    # Launch in clustering mode (app_mode "3" == CLUSTERING) with the
    # clustering-specific example, model, and cluster-type config files.
    # (Commented-out alternate invocations for the similarity app removed.)
    app_main("3","clus_app_examples.json","clus_app_models.json","clus_app_clustypes.json")
307
+
clus_app_clustypes.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "Overlapped clustering (cluster size determined by zscore)": {"type":"overlapped"},
3
+ "Non-overlapped clustering (overlapped clusters aggregated)":{"type":"non-overlapped"}
4
+ }
clus_app_examples.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "Machine learning terms (phrases test)": {"name":"small_test.txt"},
3
+ "Customer feedback mixed with noise":{"name":"larger_test.txt"},
4
+ "Movie reviews": {"name":"imdb_sent.txt"}
5
+ }
clus_app_models.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+
3
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
4
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
5
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
6
+ "orig_author_url":"https://github.com/UKPLab",
7
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
8
+ "sota_info": {
9
+ "task":"Over 3.8 million downloads from Huggingface",
10
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
11
+ },
12
+ "paper_url":"https://arxiv.org/abs/1908.10084",
13
+ "mark":"True",
14
+ "class":"HFModel"},
15
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
16
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
17
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
18
+ "orig_author_url":"https://github.com/UKPLab",
19
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
20
+ "sota_info": {
21
+ "task":"Over 2 million downloads from Huggingface",
22
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
23
+ },
24
+ "paper_url":"https://arxiv.org/abs/1908.10084",
25
+ "mark":"True",
26
+ "class":"HFModel"},
27
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
28
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 700,000 downloads from Huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/all-mpnet-base-v2",
40
+ "model":"sentence-transformers/all-mpnet-base-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 500,000 downloads from Huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
52
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 500,000 downloads from Huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+
64
+ { "name":"SGPT-125M",
65
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
66
+ "fork_url":"https://github.com/taskswithcode/sgpt",
67
+ "orig_author_url":"https://github.com/Muennighoff",
68
+ "orig_author":"Niklas Muennighoff",
69
+ "sota_info": {
70
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
71
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
72
+ },
73
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
74
+ "mark":"True",
75
+ "class":"SGPTModel"},
76
+ { "name":"SIMCSE-base" ,
77
+ "model":"princeton-nlp/sup-simcse-roberta-base",
78
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
79
+ "orig_author_url":"https://github.com/princeton-nlp",
80
+ "orig_author":"Princeton Natural Language Processing",
81
+ "sota_info": {
82
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
83
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
84
+ },
85
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
86
+ "mark":"True",
87
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
88
+ { "name":"GPT-3-175B (text-similarity-davinci-001)" ,
89
+ "model":"text-similarity-davinci-001",
90
+ "fork_url":"https://openai.com/api/",
91
+ "orig_author_url":"https://openai.com/api/",
92
+ "orig_author":"OpenAI",
93
+ "sota_info": {
94
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
95
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
96
+ },
97
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
98
+ "mark":"True",
99
+ "custom_load":"False",
100
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
101
+ "alt_url":"https://openai.com/api/",
102
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
103
+ { "name":"GPT-3-6.7B (text-similarity-curie-001)" ,
104
+ "model":"text-similarity-curie-001",
105
+ "fork_url":"https://openai.com/api/",
106
+ "orig_author_url":"https://openai.com/api/",
107
+ "orig_author":"OpenAI",
108
+ "sota_info": {
109
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
110
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
111
+ },
112
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
113
+ "mark":"True",
114
+ "custom_load":"False",
115
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
116
+ "alt_url":"https://openai.com/api/",
117
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
118
+ { "name":"GPT-3-1.3B (text-similarity-babbage-001)" ,
119
+ "model":"text-similarity-babbage-001",
120
+ "fork_url":"https://openai.com/api/",
121
+ "orig_author_url":"https://openai.com/api/",
122
+ "orig_author":"OpenAI",
123
+ "sota_info": {
124
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
125
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
126
+ },
127
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
128
+ "mark":"True",
129
+ "custom_load":"False",
130
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
131
+ "alt_url":"https://openai.com/api/",
132
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
133
+ { "name":"GPT-3-350M (text-similarity-ada-001)" ,
134
+ "model":"text-similarity-ada-001",
135
+ "fork_url":"https://openai.com/api/",
136
+ "orig_author_url":"https://openai.com/api/",
137
+ "orig_author":"OpenAI",
138
+ "sota_info": {
139
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
140
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
141
+ },
142
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
143
+ "mark":"True",
144
+ "custom_load":"False",
145
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
146
+ "alt_url":"https://openai.com/api/",
147
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"}
148
+
149
+
150
+ ]
imdb_sent.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "A rating of ""1"" does not begin to express how dull, depressing and relentlessly bad this movie is."
2
+ Hated it with all my being. Worst movie ever. Mentally- scarred. Help me. It was that bad.TRUST ME!!!
3
+ "Long, boring, blasphemous. Never have I been so glad to see ending credits roll."
4
+ This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
5
+ "Were I not with friends, and so cheap, I would have walked out. It failed miserably as satire and didn't even have the redemption of camp."
6
+ For pure gothic vampire cheese nothing can compare to the Subspecies films. I highly recommend each and every one of them.
7
+ "A great film in its genre, the direction, acting, most especially the casting of the film makes it even more powerful. A must see."
8
+ "This is a terrible movie, don't waste your money on it. Don't even watch it for free. That's all I have to say."
9
+ I wouldn't rent this one even on dollar rental night.
10
+ "More suspenseful, more subtle, much, much more disturbing...."
11
+ This is a good film. This is very funny. Yet after this film there were no good Ernest films!
12
+ A touching movie. It is full of emotions and wonderful acting. I could have sat through it a second time.
13
+ "Great movie - especially the music - Etta James - ""At Last"". This speaks volumes when you have finally found that special someone."
14
+ If you've ever had a mad week-end out with your mates then you'll appreciate this film. Excellent fun and a laugh a minute.
15
+ "I think it's one of the greatest movies which are ever made, and I've seen many... The book is better, but it's still a very good movie!"
16
+ Brilliant and moving performances by Tom Courtenay and Peter Finch.
17
+ The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Auteuil.
18
+ You've got to be kidding. This movie sucked for the sci-fi fans. I would only recommend watching this only if you think Armageddon was good.
19
+ Ten minutes of people spewing gallons of pink vomit. Recurring scenes of enormous piles of dog excrement - need one say more???
20
+ "As usual, Sean Connery does a great job. Lawrence Fishburn is good, but I have a hard time not seeing him as Ike Turner."
21
+ This movie is terrible but it has some good effects.
22
+ You'd better choose Paul Verhoeven's even if you have watched it.
23
+ "Brilliant. Ranks along with Citizen Kane, The Matrix and Godfathers. Must see, at least for basset in her early days. Watch it."
24
+ "I don't know why I like this movie so well, but I never get tired of watching it."
25
+ The one-liners fly so fast in this movie that you can watch it over and over and still catch new ones. By far one of the best of this genre.
26
+ "Don't waste your time and money on it. It's not quite as bad as ""Adrenalin"", by the same director but that's not saying much."
27
+ "Read the book, forget the movie!"
28
+ This is a great movie. Too bad it is not available on home video.
29
+ "Very intelligent language usage of Ali, which you musn't miss! In one word: (eeh sentence...) Wicked, so keep it real and pass it on!"
30
+ Primary plot!Primary direction!Poor interpretation.
31
+ "If you like Pauly Shore, you'll love Son in Law. If you hate Pauly Shore, then, well...I liked it!"
32
+ Just love the interplay between two great characters of stage & screen - Veidt & Barrymore
33
+ "This movie will always be a Broadway and Movie classic, as long as there are still people who sing, dance, and act."
34
+ This is the greatest movie ever. If you have written it off with out ever seeing it. You must give it a second try.
35
+ "What a script, what a story, what a mess!"
36
+ "I caught this film late at night on HBO. Talk about wooden acting, unbelievable plot, et al. Very little going in its favor. Skip it."
37
+ This is without a doubt the worst movie I have ever seen. It is not funny. It is not interesting and should not have been made.
38
+ Ming The Merciless does a little Bardwork and a movie most foul!
39
+ This is quite possibly the worst sequel ever made. The script is unfunny and the acting stinks. The exact opposite of the original.
40
+ "This is the definitive movie version of Hamlet. Branagh cuts nothing, but there are no wasted moments."
41
+ My favorite movie. What a great story this really was. I'd just like to be able to buy a copy of it but this does not seem possible.
42
+ "Comment this movie is impossible. Is terrible, very improbable, bad interpretation e direction. Not look!!!!!"
43
+ "Brilliant movie. The drawings were just amazing. Too bad it ended before it begun. I´ve waited 21 years for a sequel, but nooooo!!!"
44
+ a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.
45
+ "This is a very cool movie. The ending of the movie is a bit more defined than the play's ending, but either way it is still a good movie."
46
+ "Without a doubt, one of Tobe Hoppor's best! Epic storytellng, great special effects, and The Spacegirl (vamp me baby!)."
47
+ I hope this group of film-makers never re-unites.
48
+ Unwatchable. You can't even make it past the first three minutes. And this is coming from a huge Adam Sandler fan!!1
49
+ "One of the funniest movies made in recent years. Good characterization, plot and exceptional chemistry make this one a classic"
50
+ "Add this little gem to your list of holiday regulars. It is sweet, funny, and endearing"
51
+ "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!"
52
+ "If you haven't seen this, it's terrible. It is pure trash. I saw this about 17 years ago, and I'm still screwed up from it."
53
+ Absolutely fantastic! Whatever I say wouldn't do this underrated movie the justice it deserves. Watch it now! FANTASTIC!
54
+ "As a big fan of Tiny Toon Adventures, I loved this movie!!! It was so funny!!! It really captured how cartoons spent their summers."
55
+ Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its tension promising set-up. (3/10)
56
+ The Fiendish Plot of Dr. Fu Manchu (1980). This is hands down the worst film I've ever seen. What a sad way for a great comedian to go out.
57
+ "Obviously written for the stage. Lightweight but worthwhile. How can you go wrong with Ralph Richardson, Olivier and Merle Oberon."
58
+ This movie turned out to be better than I had expected it to be. Some parts were pretty funny. It was nice to have a movie with a new plot.
59
+ This movie is terrible. It's about some no brain surfin dude that inherits some company. Does Carrot Top have no shame?
60
+ Adrian Pasdar is excellent is this film. He makes a fascinating woman.
61
+ "An unfunny, unworthy picture which is an undeserving end to Peter Sellers' career. It is a pity this movie was ever made."
62
+ "The plot was really weak and confused. This is a true Oprah flick. (In Oprah's world, all men are evil and all women are victims.)"
larger_test.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ do u really want me to unistall this app...whenever i open this app, u ask for review...how many times i have to give review-feedback...
2
+ I don't like how it asks to give a review everytime I open the app
3
+ Stop asking for review everytime I open the app..it's pathetic..the updated version sucks
4
+ If i already provided the review for this application but why this application is asking for reviews every time when I am opening the application so improve this feature. This feature is very irritating. Apart from that overall experience is very good.
5
+ as you guys bother me so much for the review even i gave my opinion already but every time i open the app it ask for review so i gave it 1 star , previsiousally it was 4 star.
6
+ repeatedly asking to rate the app...
7
+ Very irritating. Everytime i open app it asked for review hence giving 2 instead of 4
8
+ stop asking for ratings every time when open the app. I had rated this app 5 star but now every time app asking for give rating, its disgusting. so I'll give only one star
9
+ I swear if i see that feedback ad one more time im gonna uninstall this app and start using another one else
10
+ I'm am downgrading my rating because the app is good and I also gave it 5 satar but why I am getting unnecessary pop up to give it review please fix it
11
+ No rating ... worsted app ... please playstore delete this app
12
+ Much bad experience . when I used to open the app it requires feedback every time.
13
+ I already rated it then why always it pop up... Its irritate me a lot everytime when I open this app... Plz fix this
14
+ This app any time ask me for rating i hate this
15
+ Very Good app but asks to rate it all the time....all these popups are annoying when you are in hurry
16
+ I'm already rated this app. And now from one week and adove this app is asking for rating please solve the problem as soon as possible. Thank you
17
+ The app is too good but it send me notification again and again to rate it that's why am I giving one star to it
18
+ Constantly asks me to rate the app! So annoying.
19
+ Today again i am go to rating this app due to its ad less and best interface with good features again more
20
+ App is very disturbing .. very bad app
21
+ If I don't want to rate it's my personal choice, so why this app gives notification every single time,, it's quite frustrating therefore 1star Other wise app is best for it's work
22
+ For frustrating me every time to rate your app
23
+ Super exalent app can you pls reply to my comment how is my review so thanks to provide this app thanks
24
+ Vey bad app so disturbance ... All time get notification about rating... That allready done
25
+ Earlier I had given 5 stars to this app but even after giving review in this app, it speaks to rate now, so I removed 5 stars in edit review and put 1 star, now this app will be happy.
26
+ Every time I open the app it asking rating. I rated 4 before now I de rate to 1."
27
+ I love this app. So damn good
28
+ This app rocks!!!!!!
29
+ This app totally sucks
30
+ Wow what a useful app
31
+ I cant live without this app!
32
+ Shit! This app rocks. I can never imagine going out without using this app. So damn useful!
33
+ Elon musk is the founder of SpaceX
34
+ Parasites suck blood out of deer
35
+ A review of his conduct revealed he violated the rules everytime he downloaded movies
36
+ My god. If only I could rate this app 100 stars for its excellence
37
+ The board conducted a review and determined electons were fair
38
+ WTF!
39
+ Crossing the chasm is a great book review that is often quoted by readers
40
+ Why am I seeing everything in double like I m drink - is my vision going bad???
41
+ Expolanets keep going round and round their stars many times a day
42
+ I have recommended this app to so many friends and they love it too
43
+ The sale of electric cars has gone up since the increse in gas prices
44
+ Stable diffusion app is the rage on the internet with multiple people either downloading on the laptop and trying it or useing the web interface
45
+ OpenAI is trying to make money by exposin their NLP apps through an API
46
+ Co:here, Ai21 and other are trying to emulate OpenAIs business model and exposing NLP apps that depend on LLMs through metered APIs
47
+ Serverless GPUs have emerged as a new business model catering to end users who want to host apps
48
+ Cerberas, Sambanova are betting on models to grow larger and harder to train on traditional GPUS
49
+ Nvidia released Hopper series as a successor to A100 series
50
+ Oh my god! I am done with this app!
51
+ Oh my god! I love this sweet puppy. He rounds around the chair so many times
52
+ I plan to write a nasty review for this shitty movie
long_form_logo_with_icon.png ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ scipy
3
+ torch
4
+ sentencepiece
5
+ openai
run.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit run app.py --server.port 80 "1" "sim_app_examples.json" "sim_app_models.json"
2
+
small_test.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ machine learning
2
+ Transformers have become the staple architecture for deep learning models
3
+ NLP
4
+ Diffusion models
5
+ natural language processing
6
+ deep learning
7
+ Deep Learning
8
+ Support vector machines
9
+ random forests
10
+ probability distribution
11
+ Cross entropy loss
12
+ Kullback leibler divergence
13
+ Shannon entropy
14
+ Activation functions
15
+ ATM
16
+ deep fakes
17
+ AGI
18
+ AI
19
+ deep trouble
20
+ artificial intelligence
21
+ deep diving
22
+ artificial snow
23
+ shallow waters
24
+ deep end
25
+ RELU
26
+ sigmoid
27
+ GELU
28
+ RNN
29
+ CNN
30
+ Gaussian
text-similarity-ada-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-ada-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-ada-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-davinci-001imdb_sent_embed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5165d88bbd1b913de4d9bb82cf64d078f92552da1b4556e5f0b0cb436c2332f0
3
+ size 17274908
text-similarity-davinci-001larger_test_embed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69a71cb2c1ce371c7372ac75ab631120ed5e508cd85c9872412f59d88b14ca0f
3
+ size 14491019
text-similarity-davinci-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
twc_clustering.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.spatial.distance import cosine
2
+ import argparse
3
+ import json
4
+ import pdb
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import time
9
+ from collections import OrderedDict
10
+
11
+
12
class TWCClustering:
    """Z-score-threshold clustering over a cosine-similarity matrix.

    Thresholds are expressed as z-scores: a threshold ``t`` maps to the
    cosine value ``mean + t * std``, where mean/std are computed over the
    full pairwise similarity matrix.
    """

    def __init__(self):
        print("In Zscore Clustering")

    def compute_matrix(self, embeddings):
        """Return the pairwise cosine-similarity matrix for *embeddings*.

        Each row is L2-normalized first, so row-wise inner products equal
        cosine similarities. Input is any 2-D array-like of vectors.
        """
        embeddings = np.array(embeddings)
        # Row-normalize, then a single inner product yields all cosines.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / norms
        similarity_matrix = np.inner(normalized, normalized)
        return similarity_matrix

    def get_terms_above_threshold(self, matrix, embeddings, pivot_index, threshold):
        """Return indices j >= pivot_index whose similarity to the pivot meets threshold."""
        return [j for j in range(pivot_index, len(embeddings))
                if matrix[pivot_index][j] >= threshold]

    def update_picked_dict_arr(self, picked_dict, arr):
        """Mark every index in *arr* as already assigned to a cluster."""
        for idx in arr:
            picked_dict[idx] = 1

    def update_picked_dict(self, picked_dict, in_dict):
        """Mark every key of *in_dict* as already assigned to a cluster."""
        for key in in_dict:
            picked_dict[key] = 1

    def find_pivot_subgraph(self, pivot_index, arr, matrix, threshold, strict_cluster=True):
        """Choose the best cluster center among the nodes in *arr*.

        The center is the node with the maximal summed similarity to the
        other members (first such node wins ties). With strict_cluster=True,
        pairs below *threshold* are excluded from both the score and the
        neighbor set.

        Returns a dict: {"pivot_index": center, "orig_index": seed pivot,
        "neighs": OrderedDict of member -> similarity-to-center, descending}.
        """
        center_index = pivot_index
        center_score = 0
        center_dict = {}
        for node_i in arr:
            running_score = 0
            temp_dict = {}
            for node_j in arr:
                sim = matrix[node_i][node_j]
                if (sim < threshold) and strict_cluster:
                    continue
                running_score += sim
                temp_dict[node_j] = sim
            if running_score > center_score:
                center_index = node_i
                center_dict = temp_dict
                center_score = running_score
        sorted_d = OrderedDict(sorted(center_dict.items(), key=lambda kv: kv[1], reverse=True))
        return {"pivot_index": center_index, "orig_index": pivot_index, "neighs": sorted_d}

    def update_overlap_stats(self, overlap_dict, cluster_info):
        """Count, per node, how many clusters the node appears in."""
        for val in cluster_info["neighs"]:
            overlap_dict[val] = overlap_dict.get(val, 0) + 1

    def bucket_overlap(self, overlap_dict):
        """Histogram overlap counts: overlap-count -> number of nodes with that count."""
        bucket_dict = {}
        for count in overlap_dict.values():
            bucket_dict[count] = bucket_dict.get(count, 0) + 1
        return OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1], reverse=False))

    def merge_clusters(self, ref_cluster, curr_cluster):
        """Append to *ref_cluster* (in place) the members of *curr_cluster* not already present."""
        seen = set(ref_cluster)
        for node in curr_cluster:
            if node not in seen:
                ref_cluster.append(node)
                seen.add(node)

    def non_overlapped_clustering(self, matrix, embeddings, threshold, mean, std, cluster_dict):
        """Partition nodes into disjoint clusters; appends to cluster_dict["clusters"].

        Returns an empty dict (no overlap statistics exist for disjoint clusters).
        """
        picked_dict = {}
        candidates = []
        zscore = mean + threshold * std  # hoisted: loop-invariant

        # Seed one candidate cluster from each not-yet-assigned node.
        for i in range(len(embeddings)):
            if i in picked_dict:
                continue
            arr = self.get_terms_above_threshold(matrix, embeddings, i, zscore)
            if not arr:
                # Fix: with a very high threshold even the pivot's self-similarity
                # can miss the cutoff; keep the pivot as a singleton so the
                # arr[0] access below cannot raise IndexError.
                arr = [i]
            candidates.append(arr)
            self.update_picked_dict_arr(picked_dict, arr)

        # Merge candidate clusters that share members until all are disjoint;
        # restart the scan after every merge (same strategy as before, each
        # merge shrinks the list so this terminates).
        merged = True
        while merged:
            merged = False
            i = 0
            while i < len(candidates) and not merged:
                j = i + 1
                while j < len(candidates) and not merged:
                    if any(node in candidates[i] for node in candidates[j]):
                        self.merge_clusters(candidates[i], candidates[j])
                        candidates.pop(j)
                        merged = True
                    j += 1
                i += 1

        for arr in candidates:
            cluster_info = self.find_pivot_subgraph(arr[0], arr, matrix, zscore, strict_cluster=False)
            cluster_dict["clusters"].append(cluster_info)
        return {}

    def overlapped_clustering(self, matrix, embeddings, threshold, mean, std, cluster_dict):
        """Build possibly-overlapping clusters; appends to cluster_dict["clusters"].

        Returns an OrderedDict histogram of node overlap counts.
        """
        picked_dict = {}
        overlap_dict = {}
        zscore = mean + threshold * std
        for i in range(len(embeddings)):
            if i in picked_dict:
                continue
            arr = self.get_terms_above_threshold(matrix, embeddings, i, zscore)
            cluster_info = self.find_pivot_subgraph(i, arr, matrix, zscore, strict_cluster=True)
            self.update_picked_dict(picked_dict, cluster_info["neighs"])
            self.update_overlap_stats(overlap_dict, cluster_info)
            cluster_dict["clusters"].append(cluster_info)
        return self.bucket_overlap(overlap_dict)

    def cluster(self, output_file, texts, embeddings, threshold, clustering_type):
        """Cluster *embeddings* and return a result dict.

        Parameters:
            output_file: accepted for interface compatibility; not used here.
            texts: accepted for interface compatibility; not used here.
            embeddings: 2-D array-like of sentence vectors.
            threshold: z-score threshold (cosine cutoff = mean + threshold*std).
            clustering_type: "overlapped" for overlapping clusters, anything
                else for disjoint clusters.

        Returns {"clusters": [...], "info": {mean, std, current_threshold,
        zscores, overlap}}.
        """
        is_overlapped = clustering_type == "overlapped"
        matrix = self.compute_matrix(embeddings)
        mean = np.mean(matrix)
        std = np.std(matrix)
        # Tabulate which cosine each integer z-score threshold maps to.
        zscores = []
        inc = 0
        value = mean
        while value < 1:
            zscores.append({"threshold": inc, "cosine": round(value, 2)})
            if std == 0:
                break  # fix: zero spread would otherwise loop forever
            inc += 1
            value = mean + inc * std
        cluster_dict = {"clusters": []}
        if is_overlapped:
            sorted_d = self.overlapped_clustering(matrix, embeddings, threshold, mean, std, cluster_dict)
        else:
            sorted_d = self.non_overlapped_clustering(matrix, embeddings, threshold, mean, std, cluster_dict)
        curr_threshold = f"{threshold} (cosine:{mean + threshold * std:.2f})"
        cluster_dict["info"] = {"mean": mean, "std": std, "current_threshold": curr_threshold,
                                "zscores": zscores, "overlap": list(sorted_d.items())}
        return cluster_dict
176
+
177
+
twc_embeddings.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModel, AutoTokenizer
2
+ from transformers import AutoModelForCausalLM
3
+ from scipy.spatial.distance import cosine
4
+ import argparse
5
+ import json
6
+ import pdb
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
def read_text(input_file):
    """Read *input_file* and return its newline-terminated lines.

    Splits on "\n" and drops the final element, matching the original
    split("\n")[:-1] behavior (for a newline-terminated file the last
    element is ""; note a trailing unterminated line would be dropped too).
    Fix: the file handle is now closed via a context manager instead of
    being leaked.
    """
    with open(input_file) as fp:
        arr = fp.read().split("\n")
    return arr[:-1]
13
+
14
+
15
class CausalLMModel:
    """Ranks documents against a query using a causal LM.

    Unlike the embedding models in this file, scoring is done by the
    log-probability the LM assigns to the query as a continuation of a
    prompt that embeds each document (no vector embeddings are produced).
    """

    def __init__(self):
        # Model and tokenizer are loaded lazily in init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In CausalLMModel Constructor")

    def init_model(self,model_name = None):
        """Load tokenizer and causal LM (default: EleutherAI/gpt-neo-125M) in eval mode."""
        # The package will take care of downloading the models automatically.
        if (self.debug):
            print("Init model",model_name)
        # For best performance: EleutherAI/gpt-j-6B
        if (model_name is None):
            model_name = "EleutherAI/gpt-neo-125M"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model.eval()
        # Prompt template; each document is substituted for "{}" and the
        # query is scored as the continuation after the final quote.
        self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Score every document against the query.

        texts[0] is the query, the remaining lines are documents
        (input_data is a file path when is_file is True, else a list).
        Returns (texts, scores) where scores[i] is the summed log-probability
        of the query tokens continuing the prompt built from docs[i].
        """
        if (self.debug):
            print("Computing embeddings for:", input_data[:20])
        model = self.model
        tokenizer = self.tokenizer

        texts = read_text(input_data) if is_file == True else input_data
        query = texts[0]
        docs = texts[1:]

        scores = []
        for doc in docs:
            context = self.prompt.format(doc)

            context_enc = tokenizer.encode(context, add_special_tokens=False)
            continuation_enc = tokenizer.encode(query, add_special_tokens=False)
            # Slice off the last token, as we take its probability from the one before
            model_input = torch.tensor(context_enc+continuation_enc[:-1])
            continuation_len = len(continuation_enc)
            input_len, = model_input.shape

            # [seq_len] -> [seq_len, vocab]
            logprobs = torch.nn.functional.log_softmax(model(model_input)[0], dim=-1).cpu()
            # [seq_len, vocab] -> [continuation_len, vocab]
            logprobs = logprobs[input_len-continuation_len:]
            # Gather the log probabilities of the continuation tokens -> [continuation_len]
            logprobs = torch.gather(logprobs, 1, torch.tensor(continuation_enc).unsqueeze(-1)).squeeze(-1)
            score = torch.sum(logprobs)
            scores.append(score.tolist())
        return texts,scores

    def output_results(self,output_file,texts,scores,main_index = 0):
        """Map each document to its score, sort descending, optionally write JSON.

        Note: despite the name, *scores* here are LM log-probabilities, not
        cosine similarities. Returns the sorted {doc: score} dict.
        """
        cosine_dict = {}
        docs = texts[1:]
        if (self.debug):
            print("Total sentences",len(texts))
        assert(len(scores) == len(docs))
        for i in range(len(docs)):
            cosine_dict[docs[i]] = scores[i]

        if (self.debug):
            print("Input sentence:",texts[main_index])
        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key in sorted_dict:
                print("Document score for \"%s\" is: %.3f" % (key[:100], sorted_dict[key]))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
88
+
89
+
90
class SGPTQnAModel:
    """Asymmetric (query vs. document) SGPT embeddings.

    Queries are wrapped in "[" ... "]" marker tokens and documents in
    "{" ... "}" markers before position-weighted mean pooling.
    """

    def __init__(self):
        # Model and tokenizer are loaded lazily in init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In SGPT Q&A Constructor")


    def init_model(self,model_name = None):
        """Load tokenizer/model (default: SGPT-125M msmarco specb bitfit) and
        cache the bracket token ids used to mark queries and documents."""
        # The package will take care of downloading the models automatically.
        # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
        if (self.debug):
            print("Init model",model_name)
        if (model_name is None):
            model_name = "Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        # Special bracket token ids: queries use [...], documents use {...}.
        self.SPECB_QUE_BOS = self.tokenizer.encode("[", add_special_tokens=False)[0]
        self.SPECB_QUE_EOS = self.tokenizer.encode("]", add_special_tokens=False)[0]

        self.SPECB_DOC_BOS = self.tokenizer.encode("{", add_special_tokens=False)[0]
        self.SPECB_DOC_EOS = self.tokenizer.encode("}", add_special_tokens=False)[0]


    def tokenize_with_specb(self,texts, is_query):
        """Tokenize *texts* and wrap each sequence in query or document markers.

        Mutates the per-sequence id/mask lists in place, then pads the batch
        and returns it as tensors.
        """
        # Tokenize without padding
        batch_tokens = self.tokenizer(texts, padding=False, truncation=True)
        # Add special brackets & pay attention to them
        for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
            if is_query:
                seq.insert(0, self.SPECB_QUE_BOS)
                seq.append(self.SPECB_QUE_EOS)
            else:
                seq.insert(0, self.SPECB_DOC_BOS)
                seq.append(self.SPECB_DOC_EOS)
            att.insert(0, 1)
            att.append(1)
        # Add padding
        batch_tokens = self.tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
        return batch_tokens

    def get_weightedmean_embedding(self,batch_tokens, model):
        """Position-weighted mean pooling over the last hidden state.

        Token weights grow linearly with position (1..seq_len), so later
        tokens contribute more. Returns a [bs, hid_dim] tensor.
        (Note: pools with self.model; the *model* argument is unused.)
        """
        # Get the embeddings
        with torch.no_grad():
            # Get hidden state of shape [bs, seq_len, hid_dim]
            last_hidden_state = self.model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state

        # Get weights of shape [bs, seq_len, hid_dim]
        weights = (
            torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
            .unsqueeze(0)
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float().to(last_hidden_state.device)
        )

        # Get attn mask of shape [bs, seq_len, hid_dim]
        input_mask_expanded = (
            batch_tokens["attention_mask"]
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float()
        )

        # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
        sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

        embeddings = sum_embeddings / sum_mask

        return embeddings

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Embed the query (texts[0]) and documents (texts[1:]) separately.

        input_data is a file path when is_file is True, else a list of
        strings. Returns (texts, (query_embeddings, doc_embeddings)).
        """
        if (self.debug):
            print("Computing embeddings for:", input_data[:20])
        model = self.model
        tokenizer = self.tokenizer

        texts = read_text(input_data) if is_file == True else input_data

        queries = [texts[0]]
        docs = texts[1:]
        query_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(queries, is_query=True), self.model)
        doc_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(docs, is_query=False), self.model)
        return texts,(query_embeddings,doc_embeddings)



    def output_results(self,output_file,texts,embeddings,main_index = 0):
        """Rank documents by cosine similarity to the query embedding.

        *embeddings* is the (query_embeddings, doc_embeddings) pair returned
        by compute_embeddings. Optionally writes the ranking as JSON and
        returns the sorted {doc: similarity} dict.
        """
        # Calculate cosine similarities
        # Cosine similarities are in [-1, 1]. Higher means more similar
        query_embeddings = embeddings[0]
        doc_embeddings = embeddings[1]
        cosine_dict = {}
        queries = [texts[0]]
        docs = texts[1:]
        if (self.debug):
            print("Total sentences",len(texts))
        for i in range(len(docs)):
            cosine_dict[docs[i]] = 1 - cosine(query_embeddings[0], doc_embeddings[i])

        if (self.debug):
            print("Input sentence:",texts[main_index])
        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key in sorted_dict:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
+ return sorted_dict
202
+
203
+
204
class SimCSEModel:
    """Sentence similarity backed by a SimCSE checkpoint's pooler output."""

    def __init__(self):
        # Both are populated by init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In SimCSE constructor")

    def init_model(self, model_name=None):
        """Load tokenizer and model; defaults to the supervised RoBERTa-large SimCSE."""
        if model_name is None:
            model_name = "princeton-nlp/sup-simcse-roberta-large"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def compute_embeddings(self, input_file_name, input_data, is_file):
        """Embed sentences (file path when is_file is True, else a list).

        Returns (sentences, embeddings) where embeddings is the model's
        pooler output for the padded/truncated batch.
        """
        sentences = read_text(input_data) if is_file == True else input_data
        batch = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            vectors = self.model(**batch, output_hidden_states=True, return_dict=True).pooler_output
        return sentences, vectors

    def output_results(self, output_file, texts, embeddings, main_index=0):
        """Rank all texts by cosine similarity to texts[main_index].

        Cosine similarities lie in [-1, 1]; higher means more similar.
        Optionally writes the ranking as JSON; returns the sorted dict.
        """
        scores = {text: 1 - cosine(embeddings[main_index], embeddings[pos])
                  for pos, text in enumerate(texts)}
        ranked = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
        if self.debug:
            for key in ranked:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, ranked[key]))
        if output_file is not None:
            with open(output_file, "w") as fp:
                fp.write(json.dumps(ranked, indent=0))
        return ranked
+ return sorted_dict
242
+
243
+
244
+
245
class SGPTModel:
    """Symmetric SGPT sentence embeddings via position-weighted mean pooling."""

    def __init__(self):
        # Model and tokenizer are loaded lazily in init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In SGPT Constructor")


    def init_model(self,model_name = None):
        """Load tokenizer and model (default: SGPT-125M NLI bitfit) in eval mode."""
        # The package will take care of downloading the models automatically.
        # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
        if (self.debug):
            print("Init model",model_name)
        if (model_name is None):
            model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
        #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
        #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
        #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
        # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
        self.model.eval()

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Embed sentences (input_data is a file path when is_file is True,
        else a list of strings).

        Returns (texts, embeddings): a [bs, hid_dim] tensor produced by
        position-weighted mean pooling over the last hidden state, where
        token weights grow linearly with position (1..seq_len).
        """
        if (self.debug):
            print("Computing embeddings for:", input_data[:20])
        model = self.model
        tokenizer = self.tokenizer

        texts = read_text(input_data) if is_file == True else input_data

        # Tokenize input texts
        batch_tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        # Get the embeddings
        with torch.no_grad():
            # Get hidden state of shape [bs, seq_len, hid_dim]
            last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state

        # Get weights of shape [bs, seq_len, hid_dim]; later tokens weigh more.
        weights = (
            torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
            .unsqueeze(0)
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float().to(last_hidden_state.device)
        )

        # Get attn mask of shape [bs, seq_len, hid_dim]
        input_mask_expanded = (
            batch_tokens["attention_mask"]
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float()
        )

        # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
        sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

        embeddings = sum_embeddings / sum_mask
        return texts,embeddings

    def output_results(self,output_file,texts,embeddings,main_index = 0):
        """Rank all texts by cosine similarity to texts[main_index].

        Optionally writes the ranking as JSON; returns the sorted
        {text: similarity} dict.
        """
        # Calculate cosine similarities
        # Cosine similarities are in [-1, 1]. Higher means more similar
        cosine_dict = {}
        if (self.debug):
            print("Total sentences",len(texts))
        for i in range(len(texts)):
            cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])

        if (self.debug):
            print("Input sentence:",texts[main_index])
        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key in sorted_dict:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
+ return sorted_dict
328
+
329
+
330
+
331
+
332
+
333
class HFModel:
    """Sentence embeddings from a Hugging Face encoder with mean pooling."""

    def __init__(self):
        # Both are populated by init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In HF Constructor")


    def init_model(self, model_name=None):
        """Load tokenizer/model (default: all-MiniLM-L6-v2) and switch to eval mode."""
        if model_name is None:
            model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighting each token by the attention mask."""
        token_embeddings = model_output[0]  # first element holds all token embeddings
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against divide-by-zero
        return summed / counts

    def compute_embeddings(self, input_file_name, input_data, is_file):
        """Embed sentences (input_data is a file path when is_file is True,
        else a list). Returns (sentences, L2-normalized embeddings)."""
        sentences = read_text(input_data) if is_file == True else input_data
        encoded = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

        # Forward pass without gradients, then pool and normalize.
        with torch.no_grad():
            outputs = self.model(**encoded)
        pooled = self.mean_pooling(outputs, encoded['attention_mask'])
        normalized = F.normalize(pooled, p=2, dim=1)
        return sentences, normalized

    def output_results(self, output_file, texts, embeddings, main_index=0):
        """Rank all texts by cosine similarity to texts[main_index].

        Cosine similarities lie in [-1, 1]; higher means more similar.
        Optionally writes the ranking as JSON; returns the sorted dict.
        """
        sims = {}
        for pos, text in enumerate(texts):
            sims[text] = 1 - cosine(embeddings[main_index], embeddings[pos])
        ranked = dict(sorted(sims.items(), key=lambda item: item[1], reverse=True))
        if self.debug:
            for key in ranked:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, ranked[key]))
        if output_file is not None:
            with open(output_file, "w") as fp:
                fp.write(json.dumps(ranked, indent=0))
        return ranked
+
395
+
396
+
397
+ if __name__ == '__main__':
398
+ parser = argparse.ArgumentParser(description='SGPT model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
399
+ parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
400
+ parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
401
+ parser.add_argument('-model', action="store", dest="model",default="sentence-transformers/all-MiniLM-L6-v2",help="model name")
402
+
403
+ results = parser.parse_args()
404
+ obj = HFModel()
405
+ obj.init_model(results.model)
406
+ texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
407
+ results = obj.output_results(results.output,texts,embeddings)
twc_openai_embeddings.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.spatial.distance import cosine
2
+ import argparse
3
+ import json
4
+ import os
5
+ import openai
6
+ import pdb
7
+
8
def read_text(input_file):
    """Read input_file and return its lines as a list of strings.

    The file is expected to end with a trailing newline, so the final empty
    element produced by split("\n") is dropped (preserves original behavior).
    NOTE(review): if a file does NOT end with "\n", its last real line is
    discarded — confirm inputs always carry a trailing newline.
    """
    # Use a context manager so the handle is closed deterministically
    # (the original leaked the open file until garbage collection).
    with open(input_file) as fp:
        lines = fp.read().split("\n")
    return lines[:-1]
11
+
12
+
13
class OpenAIModel:
    """Sentence-embedding wrapper around the OpenAI embeddings API.

    Embeddings are cached on disk per (model, input file) as JSON so repeated
    runs do not re-call the billed API.
    """

    def __init__(self):
        self.debug = False
        self.model_name = None
        # When True, proceed even without OPENAI_API_KEY (e.g. when cached
        # embeddings are expected to be present on disk).
        self.skip_key = True
        print("In OpenAI API constructor")

    def init_model(self, model_name=None):
        """Pick up the API key from the environment and select the model.

        Falls back to "text-similarity-ada-001" when model_name is None.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")
        if openai.api_key is None:  # fixed: identity check, not '== None'
            openai.api_key = ""
            print("API key not set")

        if len(openai.api_key) == 0 and not self.skip_key:
            print("Open API key not set")

        self.model_name = "text-similarity-ada-001" if model_name is None else model_name
        print("OpenAI: Init model complete", model_name)

    def compute_embeddings(self, input_file_name, input_data, is_file):
        """Return (texts, embeddings) for the sentences in input_data.

        input_file_name is used only to derive the cache file name;
        input_data is a file path when is_file is True, otherwise a list of
        sentences. Returns ([], []) when no API key is set and keys are
        required.
        """
        if len(openai.api_key) == 0 and not self.skip_key:
            print("Open API key not set")
            return [], []
        # Cache name convention: <model><input stem>_embed.json — matches the
        # pre-computed cache files shipped in this repository.
        in_file = self.model_name + '.'.join(input_file_name.split('.')[:-1]) + "_embed.json"
        cached = False
        embeddings = []
        try:
            # Fixed: close the cache file, and only mark 'cached' after a
            # successful parse (the original set cached=True before json.load,
            # so a corrupt cache left 'embeddings' unbound).
            with open(in_file) as fp:
                embeddings = json.load(fp)
            cached = True
            print("Using cached embeddings")
        except (OSError, ValueError):
            # Cache miss or unreadable/corrupt cache: recompute below.
            # (json.JSONDecodeError is a ValueError subclass.)
            pass

        texts = read_text(input_data) if is_file else input_data
        if not cached:
            print(f"Computing embeddings for {input_file_name} and model {self.model_name}")
            response = openai.Embedding.create(
                input=texts,
                model=self.model_name
            )
            embeddings = [item['embedding'] for item in response['data']]
            # Persist for subsequent runs.
            with open(in_file, "w") as fp:
                json.dump(embeddings, fp)
        return texts, embeddings

    def output_results(self, output_file, texts, embeddings, main_index=0):
        """Rank sentences by cosine similarity to texts[main_index].

        Returns the ranked {sentence: similarity} dict (similarities in
        [-1, 1], higher = more similar) and writes it as JSON to output_file
        when one is given. Returns {} when no API key is set and keys are
        required.
        """
        if len(openai.api_key) == 0 and not self.skip_key:
            print("Open API key not set")
            return {}
        # scipy's cosine() is a distance, so similarity = 1 - distance.
        cosine_dict = {}
        for i in range(len(texts)):
            cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])

        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1], reverse=True))
        if self.debug:
            for key in sorted_dict:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
        if output_file is not None:
            with open(output_file, "w") as fp:
                fp.write(json.dumps(sorted_dict, indent=0))
        return sorted_dict
89
+
90
+
91
+
92
if __name__ == '__main__':
    # Command-line driver: embed the sentences in -input with the OpenAI API
    # and write ranked cosine similarities to -output.
    parser = argparse.ArgumentParser(description='OpenAI model for sentence embeddings ',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-input', action="store", dest="input", required=True,
                        help="Input file with sentences")
    parser.add_argument('-output', action="store", dest="output", default="output.txt",
                        help="Output file with results")
    parser.add_argument('-model', action="store", dest="model",
                        default="text-similarity-ada-001",
                        help="model name")

    args = parser.parse_args()
    obj = OpenAIModel()
    obj.init_model(args.model)
    # BUG FIX: compute_embeddings requires (input_file_name, input_data, is_file);
    # the original passed only one positional argument, raising TypeError.
    # The input path serves as both the cache-name seed and the data source.
    texts, embeddings = obj.compute_embeddings(args.input, args.input, is_file=True)
    results = obj.output_results(args.output, texts, embeddings)