Jolie80 taskswithcode committed on
Commit
cebe313
·
0 Parent(s):

Duplicate from taskswithcode/semantic_clustering

Browse files

Co-authored-by: RA <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ text-similarity-davinci-001imdb_sent_embed.json filter=lfs diff=lfs merge=lfs -text
33
+ text-similarity-davinci-001larger_test_embed.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Semantic Clustering
3
+ emoji: 🏃
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: taskswithcode/semantic_clustering
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import sys
3
+ import streamlit as st
4
+ import string
5
+ from io import StringIO
6
+ import pdb
7
+ import json
8
+ from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
9
+ from twc_openai_embeddings import OpenAIModel
10
+ from twc_clustering import TWCClustering
11
+ import torch
12
+ import requests
13
+ import socket
14
+
15
+
16
+ MAX_INPUT = 100
17
+
18
+ SEM_SIMILARITY="1"
19
+ DOC_RETRIEVAL="2"
20
+ CLUSTERING="3"
21
+
22
+
23
+ use_case = {"1":"Finding similar phrases/sentences","2":"Retrieving semantically matching information to a query. It may not be a factual match","3":"Clustering"}
24
+ use_case_url = {"1":"https://huggingface.co/spaces/taskswithcode/semantic_similarity","2":"https://huggingface.co/spaces/taskswithcode/semantic_search","3":""}
25
+
26
+
27
+
28
+ from transformers import BertTokenizer, BertForMaskedLM
29
+
30
+
31
+ APP_NAME = "hf/semantic_clustering"
32
+ INFO_URL = "https://www.taskswithcode.com/stats/"
33
+
34
+
35
+
36
+
37
+
38
def get_views(action):
    """Report a usage event to the TWC stats endpoint and return the view count.

    On the first call of a session, posts an event and caches the returned
    count in ``st.session_state["view_count"]``; later calls reuse the cached
    count and (for non-"init" actions) post a fire-and-forget event.

    Args:
        action: event name, e.g. "init" or "submit".

    Returns:
        The view count formatted with thousands separators (str).
    """
    ret_val = 0
    hostname = socket.gethostname()
    ip_address = socket.gethostbyname(hostname)
    app_info = {'name': APP_NAME, "action": action, "host": hostname, "ip": ip_address}
    if ("view_count" not in st.session_state):
        try:
            res = requests.post(INFO_URL, json=app_info).json()
            print(res)
            data = res["count"]
        except Exception:
            # Stats are best-effort; fall back to 0 rather than break the page.
            data = 0
        ret_val = data
        st.session_state["view_count"] = data
    else:
        ret_val = st.session_state["view_count"]
        if (action != "init"):
            # Best-effort event ping; the original left this unguarded, so a
            # transient network error crashed the whole page render.
            try:
                requests.post(INFO_URL, json=app_info).json()
            except Exception:
                pass
    return "{:,}".format(ret_val)
58
+
59
+
60
+
61
+
62
def construct_model_info_for_display(model_names):
    """Build the model-picker options and an HTML blurb describing each model.

    Args:
        model_names: list of model-metadata dicts loaded from the models JSON.

    Returns:
        (options_arr, markdown_str): the display names for the selectbox and
        the HTML description rendered below the form.
    """
    option_names = []
    chunks = [f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>The selected models satisfy one or more of the following (1) state-of-the-art (2) the most downloaded models on Hugging Face (3) Large Language Models (e.g. GPT-3)</i></div>"]
    chunks.append("<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>")
    for entry in model_names:
        option_names.append(entry["name"])
        # Only entries explicitly marked are described in the HTML blurb.
        if (entry["mark"] == "True"):
            chunks.append(f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{entry['paper_url']}\' target='_blank'>{entry['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{entry['orig_author_url']}\' target='_blank'>{entry['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{entry['sota_info']['sota_link']}\' target='_blank'>{entry['sota_info']['task']}</a></div>")
            if ("Note" in entry):
                chunks.append(f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{entry['Note']}<a href=\'{entry['alt_url']}\' target='_blank'>link</a></div>")
            chunks.append("<div style=\"font-size:16px; color: #5f5f5f; text-align: left\"><br/></div>")
    chunks.append("<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><b>Note:</b><br/>•&nbsp;Uploaded files are loaded into non-persistent memory for the duration of the computation. They are not cached</div>")
    limit = "{:,}".format(MAX_INPUT)
    chunks.append(f"<div style=\"font-size:12px; color: #9f9f9f; text-align: left\">•&nbsp;User uploaded file has a maximum limit of {limit} sentences.</div>")
    return option_names, "".join(chunks)
78
+
79
+
80
# Page chrome: configure the Streamlit page, then render the TWC logo banner
# in an 85/15 column split (the narrow column is padding).
st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for semantic clustering using sentence embeddings', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
            menu_items={
             'About': 'This app was created by taskswithcode. http://taskswithcode.com'

              })
col,pad = st.columns([85,15])

with col:
    st.image("long_form_logo_with_icon.png")
90
+
91
@st.experimental_memo
def load_model(model_name, model_class, load_model_name):
    """Instantiate and initialize an embedding model, memoized by Streamlit.

    Args:
        model_name: display name (used only for error reporting / cache key).
        model_class: name of a model class resolvable via ``globals()``,
            e.g. "HFModel" or "OpenAIModel".
        load_model_name: checkpoint/model identifier passed to ``init_model``.

    Returns:
        The initialized model instance, or None on failure (the error is
        surfaced in the UI via ``st.error``).
    """
    ret_model = None
    try:
        obj_class = globals()[model_class]
        ret_model = obj_class()
        ret_model.init_model(load_model_name)
        # Explicit check instead of `assert`, which is stripped under -O.
        if ret_model is None:
            raise ValueError("model initialization produced no model")
    except Exception as e:
        st.error(f"Unable to load model class:{model_class} model_name: {model_name} load_model_name: {load_model_name} {str(e)}")
    return ret_model
103
+
104
+
105
+
106
@st.experimental_memo
def cached_compute_similarity(input_file_name,sentences,_model,model_name,threshold,_cluster,clustering_type):
    """Embed the sentences and cluster them, memoized by Streamlit.

    The leading-underscore parameters (_model, _cluster) are excluded from
    Streamlit's memo hashing; model_name/threshold/clustering_type key the cache.
    """
    texts, embeddings = _model.compute_embeddings(input_file_name, sentences, is_file=False)
    return _cluster.cluster(None, texts, embeddings, threshold, clustering_type)
111
+
112
+
113
def uncached_compute_similarity(input_file_name,sentences,_model,model_name,threshold,cluster,clustering_type):
    """Embed the sentences and cluster them without caching (user uploads).

    A spinner is shown while the embeddings are computed; results are
    returned directly so uploaded files never enter the memo cache.
    """
    with st.spinner('Computing vectors for sentences'):
        texts, embeddings = _model.compute_embeddings(input_file_name, sentences, is_file=False)
        clustered = cluster.cluster(None, texts, embeddings, threshold, clustering_type)
    return clustered
119
+
120
DEFAULT_HF_MODEL = "sentence-transformers/paraphrase-MiniLM-L6-v2"

def get_model_info(model_names, model_name):
    """Look up the metadata node for ``model_name``, falling back to the default.

    Args:
        model_names: list of model-metadata dicts (each with a "name" key).
        model_name: display name to look up.

    Returns:
        (node, resolved_name) for the requested model, or for
        DEFAULT_HF_MODEL when the requested name is not listed.

    Raises:
        ValueError: if neither the requested model nor the default is present.
            (The original recursed unconditionally on a miss, so a missing
            default caused infinite recursion.)
    """
    for node in model_names:
        if (model_name == node["name"]):
            return node, model_name
    # Fall back to the default exactly once instead of recursing blindly.
    if model_name != DEFAULT_HF_MODEL:
        return get_model_info(model_names, DEFAULT_HF_MODEL)
    raise ValueError(f"default model {DEFAULT_HF_MODEL} not found in model list")
126
+
127
+
128
def run_test(model_names,model_name,input_file_name,sentences,display_area,threshold,user_uploaded,custom_model,clustering_type):
    """Load the selected model, run clustering, and return the results dict.

    Args:
        model_names: model-metadata list from the models JSON.
        model_name: user-selected (or custom) model display name.
        input_file_name: name of the input file (cache key / labeling only).
        sentences: list of input sentences.
        display_area: st.empty() placeholder used for progress messages.
        threshold: zscore threshold forwarded to the clustering step.
        user_uploaded: True when the text came from a user upload (skips cache).
        custom_model: True when the name came from the free-text model box.
        clustering_type: "overlapped" / "non-overlapped".

    Returns:
        The clustering results dict, {"error": msg} when the model cannot be
        used with uploads, or {} on an unexpected failure (after st.stop()).
    """
    display_area.text("Loading model:" + model_name)
    #Note. model_name may get mapped to new name in the call below for custom models
    orig_model_name = model_name
    model_info,model_name = get_model_info(model_names,model_name)
    if (model_name != orig_model_name):
        # Custom model: the requested name itself is the checkpoint to load.
        load_model_name = orig_model_name
    else:
        load_model_name = model_info["model"]
    if ("Note" in model_info):
        # Surface the model's caveat (e.g. OpenAI API requirement) in the UI.
        fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
        display_area.write(fail_link)
    if (user_uploaded and "custom_load" in model_info and model_info["custom_load"] == "False"):
        # Model cannot embed arbitrary uploads (e.g. requires OpenAI API keys).
        fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
        display_area.write(fail_link)
        return {"error":fail_link}
    model = load_model(model_name,model_info["class"],load_model_name)
    display_area.text("Model " + model_name + " load complete")
    try:
        if (user_uploaded):
            # Uploaded text is never cached (non-persistent by design).
            results = uncached_compute_similarity(input_file_name,sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
        else:
            display_area.text("Computing vectors for sentences")
            results = cached_compute_similarity(input_file_name,sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
        display_area.text("Similarity computation complete")
        return results

    except Exception as e:
        st.error("Some error occurred during prediction" + str(e))
        st.stop()
    return {}
159
+
160
+
161
+
162
+
163
+
164
def display_results(orig_sentences,results,response_info,app_mode,model_name):
    """Render clustering results as HTML and stage the JSON download payload.

    Args:
        orig_sentences: the input sentences, indexed by the 0-based indices
            stored in ``results``.
        results: clustering output with "clusters" (pivot_index, neighs) and
            "info" (mean/std/current_threshold/zscores/overlap) keys.
        response_info: timing summary line shown above the results.
        app_mode: current app mode (unused here beyond the signature).
        model_name: model display name shown in the header.

    Side effects: writes the rendered HTML via st.markdown, stores the
    download JSON in st.session_state["download_ready"], and logs a
    "submit" view event.
    """
    main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
    main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model:&nbsp;<b>{model_name}</b></div>"
    score_text = "cosine distance"
    main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}.&nbsp;<b>{len(results['clusters'])} clusters</b>.&nbsp;&nbsp;mean:{results['info']['mean']:.2f};&nbsp;std:{results['info']['std']:.2f};&nbsp;current threshold:{results['info']['current_threshold']}<br/>Threshold hints:{str(results['info']['zscores'])}<br/>Overlap stats(overlap,freq):{str(results['info']['overlap'])}</div>"
    body_sent = []
    download_data = {}
    for i in range(len(results["clusters"])):
        pivot_index = results["clusters"][i]["pivot_index"]
        pivot_sent = orig_sentences[pivot_index]
        # Shift to 1-based numbering for display and the download payload.
        pivot_index += 1
        d_cluster = {}
        download_data[i + 1] = d_cluster
        d_cluster["pivot"] = {"pivot_index":pivot_index,"sent":pivot_sent,"children":{}}
        body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{pivot_index}]&nbsp;{pivot_sent}&nbsp;<b><i>(Cluster {i+1})</i></b>&nbsp;&nbsp;</div>")
        neighs_dict = results["clusters"][i]["neighs"]
        for key in neighs_dict:
            # Keys are 0-based sentence indices; values are cosine distances
            # to the pivot.
            cosine_dist = neighs_dict[key]
            child_index = key
            sentence = orig_sentences[child_index]
            child_index += 1
            body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{child_index}]&nbsp;{sentence}&nbsp;&nbsp;&nbsp;<b>{cosine_dist:.2f}</b></div>")
            d_cluster["pivot"]["children"][sentence] = f"{cosine_dist:.2f}"
        body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">&nbsp;</div>")
    main_sent = main_sent + "\n" + '\n'.join(body_sent)
    st.markdown(main_sent,unsafe_allow_html=True)
    st.session_state["download_ready"] = json.dumps(download_data,indent=4)
    get_views("submit")
192
+
193
+
194
def init_session():
    """Populate st.session_state defaults on first page load (idempotent).

    Uses "model_name" as the sentinel for "already initialized"; reruns of
    the script skip re-initialization so widget state survives.
    """
    if ("model_name" not in st.session_state):
        # The original assigned "model_name" twice; once is enough.
        st.session_state["model_name"] = "ss_test"
        st.session_state["download_ready"] = None
        st.session_state["threshold"] = 1.5
        st.session_state["file_name"] = "default"
        st.session_state["overlapped"] = "overlapped"
        st.session_state["cluster"] = TWCClustering()
    else:
        print("Skipping init session")
205
+
206
def app_main(app_mode,example_files,model_name_files,clus_types):
    """Top-level Streamlit page: form, model run, and results rendering.

    Args:
        app_mode: use-case key ("1" similarity, "2" retrieval, "3" clustering).
        example_files: path to the JSON mapping example labels -> file names.
        model_name_files: path to the JSON list of model metadata.
        clus_types: path to the JSON mapping clustering labels -> type dicts.
    """
    init_session()
    # Load the page's configuration: example inputs, models, clustering types.
    with open(example_files) as fp:
        example_file_names = json.load(fp)
    with open(model_name_files) as fp:
        model_names = json.load(fp)
    with open(clus_types) as fp:
        cluster_types = json.load(fp)
    curr_use_case = use_case[app_mode].split(".")[0]
    st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for semantic clustering using sentence embeddings</h5>", unsafe_allow_html=True)
    st.markdown(f"<p style='font-size:14px; color: #4f4f4f; text-align: center'><i>Or compare your own model with state-of-the-art/popular models</p>", unsafe_allow_html=True)
    st.markdown(f"<div style='color: #4f4f4f; text-align: left'>Use cases for sentence embeddings<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;<a href=\'{use_case_url['1']}\' target='_blank'>{use_case['1']}</a><br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;<a href=\'{use_case_url['2']}\' target='_blank'>{use_case['2']}</a><br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['3']}<br/><i>This app illustrates <b>'{curr_use_case}'</b> use case</i></div>", unsafe_allow_html=True)
    st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views('init')}</div>", unsafe_allow_html=True)


    try:


        with st.form('twc_form'):

            step1_line = "Upload text file(one sentence in a line) or choose an example text file below"
            if (app_mode == DOC_RETRIEVAL):
                step1_line += ". The first line is treated as the query"
            uploaded_file = st.file_uploader(step1_line, type=".txt")

            selected_file_index = st.selectbox(label=f'Example files ({len(example_file_names)})',
                options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
            st.write("")
            options_arr,markdown_str = construct_model_info_for_display(model_names)
            selection_label = 'Select Model'
            selected_model = st.selectbox(label=selection_label,
                options = options_arr, index=0, key = "twc_model")
            st.write("")
            custom_model_selection = st.text_input("Model not listed above? Type any Hugging Face sentence embedding model name ", "",key="custom_model")
            hf_link_str = "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><a href='https://huggingface.co/models?pipeline_tag=sentence-similarity' target = '_blank'>List of Hugging Face sentence embedding models</a><br/><br/><br/></div>"
            st.markdown(hf_link_str, unsafe_allow_html=True)
            threshold = st.number_input('Choose a zscore threshold (number of std devs from mean)',value=st.session_state["threshold"],min_value = 0.0,step=.01)
            st.write("")
            clustering_type = st.selectbox(label=f'Select type of clustering',
                options = list(dict.keys(cluster_types)), index=0, key = "twc_cluster_types")
            st.write("")
            submit_button = st.form_submit_button('Run')


        input_status_area = st.empty()
        display_area = st.empty()
        if submit_button:
            start = time.time()
            # Prefer the uploaded file; otherwise read the chosen example file.
            if uploaded_file is not None:
                st.session_state["file_name"] = uploaded_file.name
                sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
            else:
                st.session_state["file_name"] = example_file_names[selected_file_index]["name"]
                sentences = open(example_file_names[selected_file_index]["name"]).read()
            # Drop the trailing empty element produced by the final newline.
            sentences = sentences.split("\n")[:-1]
            if (len(sentences) > MAX_INPUT):
                st.info(f"Input sentence count exceeds maximum sentence limit. First {MAX_INPUT} out of {len(sentences)} sentences chosen")
                sentences = sentences[:MAX_INPUT]
            # A non-empty custom model name overrides the selectbox choice.
            if (len(custom_model_selection) != 0):
                run_model = custom_model_selection
            else:
                run_model = selected_model
            st.session_state["model_name"] = selected_model
            st.session_state["threshold"] = threshold
            st.session_state["overlapped"] = cluster_types[clustering_type]["type"]
            results = run_test(model_names,run_model,st.session_state["file_name"],sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0),cluster_types[clustering_type]["type"])
            display_area.empty()
            with display_area.container():
                if ("error" in results):
                    st.error(results["error"])
                else:
                    device = 'GPU' if torch.cuda.is_available() else 'CPU'
                    response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
                    if (len(custom_model_selection) != 0):
                        st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
                    display_results(sentences,results,response_info,app_mode,run_model)
                    #st.json(results)
                st.download_button(
                    label="Download results as json",
                    data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
                    disabled = False if st.session_state["download_ready"] != None else True,
                    file_name= (st.session_state["model_name"] + "_" + str(st.session_state["threshold"]) + "_" + st.session_state["overlapped"] + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
                    mime='text/json',
                    key ="download"
                )



    except Exception as e:
        st.error("Some error occurred during loading" + str(e))
        st.stop()

    st.markdown(markdown_str, unsafe_allow_html=True)
299
+
300
+
301
+
302
if __name__ == "__main__":
    # Launch in clustering mode (app_mode "3" == CLUSTERING) with the
    # clustering-specific example, model, and cluster-type config files.
    # (Commented-out alternate invocations for the similarity app removed.)
    app_main("3","clus_app_examples.json","clus_app_models.json","clus_app_clustypes.json")
307
+
clus_app_clustypes.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "Overlapped clustering (cluster size determined by zscore)": {"type":"overlapped"},
3
+ "Non-overlapped clustering (overlapped clusters aggregated)":{"type":"non-overlapped"}
4
+ }
clus_app_examples.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "Machine learning terms (phrases test)": {"name":"small_test.txt"},
3
+ "Customer feedback mixed with noise":{"name":"larger_test.txt"},
4
+ "Movie reviews": {"name":"imdb_sent.txt"}
5
+ }
clus_app_models.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+
3
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
4
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
5
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
6
+ "orig_author_url":"https://github.com/UKPLab",
7
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
8
+ "sota_info": {
9
+ "task":"Over 3.8 million downloads from Huggingface",
10
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
11
+ },
12
+ "paper_url":"https://arxiv.org/abs/1908.10084",
13
+ "mark":"True",
14
+ "class":"HFModel"},
15
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
16
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
17
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
18
+ "orig_author_url":"https://github.com/UKPLab",
19
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
20
+ "sota_info": {
21
+ "task":"Over 2 million downloads from Huggingface",
22
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
23
+ },
24
+ "paper_url":"https://arxiv.org/abs/1908.10084",
25
+ "mark":"True",
26
+ "class":"HFModel"},
27
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
28
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 700,000 downloads from Huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/all-mpnet-base-v2",
40
+ "model":"sentence-transformers/all-mpnet-base-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 500,000 downloads from Huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
52
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 500,000 downloads from Huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+
64
+ { "name":"SGPT-125M",
65
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
66
+ "fork_url":"https://github.com/taskswithcode/sgpt",
67
+ "orig_author_url":"https://github.com/Muennighoff",
68
+ "orig_author":"Niklas Muennighoff",
69
+ "sota_info": {
70
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
71
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
72
+ },
73
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
74
+ "mark":"True",
75
+ "class":"SGPTModel"},
76
+ { "name":"SIMCSE-base" ,
77
+ "model":"princeton-nlp/sup-simcse-roberta-base",
78
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
79
+ "orig_author_url":"https://github.com/princeton-nlp",
80
+ "orig_author":"Princeton Natural Language Processing",
81
+ "sota_info": {
82
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
83
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
84
+ },
85
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
86
+ "mark":"True",
87
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
88
+ { "name":"GPT-3-175B (text-similarity-davinci-001)" ,
89
+ "model":"text-similarity-davinci-001",
90
+ "fork_url":"https://openai.com/api/",
91
+ "orig_author_url":"https://openai.com/api/",
92
+ "orig_author":"OpenAI",
93
+ "sota_info": {
94
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
95
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
96
+ },
97
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
98
+ "mark":"True",
99
+ "custom_load":"False",
100
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
101
+ "alt_url":"https://openai.com/api/",
102
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
103
+ { "name":"GPT-3-6.7B (text-similarity-curie-001)" ,
104
+ "model":"text-similarity-curie-001",
105
+ "fork_url":"https://openai.com/api/",
106
+ "orig_author_url":"https://openai.com/api/",
107
+ "orig_author":"OpenAI",
108
+ "sota_info": {
109
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
110
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
111
+ },
112
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
113
+ "mark":"True",
114
+ "custom_load":"False",
115
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
116
+ "alt_url":"https://openai.com/api/",
117
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
118
+ { "name":"GPT-3-1.3B (text-similarity-babbage-001)" ,
119
+ "model":"text-similarity-babbage-001",
120
+ "fork_url":"https://openai.com/api/",
121
+ "orig_author_url":"https://openai.com/api/",
122
+ "orig_author":"OpenAI",
123
+ "sota_info": {
124
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
125
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
126
+ },
127
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
128
+ "mark":"True",
129
+ "custom_load":"False",
130
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
131
+ "alt_url":"https://openai.com/api/",
132
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
133
+ { "name":"GPT-3-350M (text-similarity-ada-001)" ,
134
+ "model":"text-similarity-ada-001",
135
+ "fork_url":"https://openai.com/api/",
136
+ "orig_author_url":"https://openai.com/api/",
137
+ "orig_author":"OpenAI",
138
+ "sota_info": {
139
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
140
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
141
+ },
142
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
143
+ "mark":"True",
144
+ "custom_load":"False",
145
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
146
+ "alt_url":"https://openai.com/api/",
147
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"}
148
+
149
+
150
+ ]
imdb_sent.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "A rating of ""1"" does not begin to express how dull, depressing and relentlessly bad this movie is."
2
+ Hated it with all my being. Worst movie ever. Mentally- scarred. Help me. It was that bad.TRUST ME!!!
3
+ "Long, boring, blasphemous. Never have I been so glad to see ending credits roll."
4
+ This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
5
+ "Were I not with friends, and so cheap, I would have walked out. It failed miserably as satire and didn't even have the redemption of camp."
6
+ For pure gothic vampire cheese nothing can compare to the Subspecies films. I highly recommend each and every one of them.
7
+ "A great film in its genre, the direction, acting, most especially the casting of the film makes it even more powerful. A must see."
8
+ "This is a terrible movie, don't waste your money on it. Don't even watch it for free. That's all I have to say."
9
+ I wouldn't rent this one even on dollar rental night.
10
+ "More suspenseful, more subtle, much, much more disturbing...."
11
+ This is a good film. This is very funny. Yet after this film there were no good Ernest films!
12
+ A touching movie. It is full of emotions and wonderful acting. I could have sat through it a second time.
13
+ "Great movie - especially the music - Etta James - ""At Last"". This speaks volumes when you have finally found that special someone."
14
+ If you've ever had a mad week-end out with your mates then you'll appreciate this film. Excellent fun and a laugh a minute.
15
+ "I think it's one of the greatest movies which are ever made, and I've seen many... The book is better, but it's still a very good movie!"
16
+ Brilliant and moving performances by Tom Courtenay and Peter Finch.
17
+ The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Auteuil.
18
+ You've got to be kidding. This movie sucked for the sci-fi fans. I would only recommend watching this only if you think Armageddon was good.
19
+ Ten minutes of people spewing gallons of pink vomit. Recurring scenes of enormous piles of dog excrement - need one say more???
20
+ "As usual, Sean Connery does a great job. Lawrence Fishburn is good, but I have a hard time not seeing him as Ike Turner."
21
+ This movie is terrible but it has some good effects.
22
+ You'd better choose Paul Verhoeven's even if you have watched it.
23
+ "Brilliant. Ranks along with Citizen Kane, The Matrix and Godfathers. Must see, at least for basset in her early days. Watch it."
24
+ "I don't know why I like this movie so well, but I never get tired of watching it."
25
+ The one-liners fly so fast in this movie that you can watch it over and over and still catch new ones. By far one of the best of this genre.
26
+ "Don't waste your time and money on it. It's not quite as bad as ""Adrenalin"", by the same director but that's not saying much."
27
+ "Read the book, forget the movie!"
28
+ This is a great movie. Too bad it is not available on home video.
29
+ "Very intelligent language usage of Ali, which you musn't miss! In one word: (eeh sentence...) Wicked, so keep it real and pass it on!"
30
+ Primary plot!Primary direction!Poor interpretation.
31
+ "If you like Pauly Shore, you'll love Son in Law. If you hate Pauly Shore, then, well...I liked it!"
32
+ Just love the interplay between two great characters of stage & screen - Veidt & Barrymore
33
+ "This movie will always be a Broadway and Movie classic, as long as there are still people who sing, dance, and act."
34
+ This is the greatest movie ever. If you have written it off with out ever seeing it. You must give it a second try.
35
+ "What a script, what a story, what a mess!"
36
+ "I caught this film late at night on HBO. Talk about wooden acting, unbelievable plot, et al. Very little going in its favor. Skip it."
37
+ This is without a doubt the worst movie I have ever seen. It is not funny. It is not interesting and should not have been made.
38
+ Ming The Merciless does a little Bardwork and a movie most foul!
39
+ This is quite possibly the worst sequel ever made. The script is unfunny and the acting stinks. The exact opposite of the original.
40
+ "This is the definitive movie version of Hamlet. Branagh cuts nothing, but there are no wasted moments."
41
+ My favorite movie. What a great story this really was. I'd just like to be able to buy a copy of it but this does not seem possible.
42
+ "Comment this movie is impossible. Is terrible, very improbable, bad interpretation e direction. Not look!!!!!"
43
+ "Brilliant movie. The drawings were just amazing. Too bad it ended before it begun. I´ve waited 21 years for a sequel, but nooooo!!!"
44
+ a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.
45
+ "This is a very cool movie. The ending of the movie is a bit more defined than the play's ending, but either way it is still a good movie."
46
+ "Without a doubt, one of Tobe Hoppor's best! Epic storytellng, great special effects, and The Spacegirl (vamp me baby!)."
47
+ I hope this group of film-makers never re-unites.
48
+ Unwatchable. You can't even make it past the first three minutes. And this is coming from a huge Adam Sandler fan!!1
49
+ "One of the funniest movies made in recent years. Good characterization, plot and exceptional chemistry make this one a classic"
50
+ "Add this little gem to your list of holiday regulars. It is sweet, funny, and endearing"
51
+ "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!"
52
+ "If you haven't seen this, it's terrible. It is pure trash. I saw this about 17 years ago, and I'm still screwed up from it."
53
+ Absolutely fantastic! Whatever I say wouldn't do this underrated movie the justice it deserves. Watch it now! FANTASTIC!
54
+ "As a big fan of Tiny Toon Adventures, I loved this movie!!! It was so funny!!! It really captured how cartoons spent their summers."
55
+ Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its tension promising set-up. (3/10)
56
+ The Fiendish Plot of Dr. Fu Manchu (1980). This is hands down the worst film I've ever seen. What a sad way for a great comedian to go out.
57
+ "Obviously written for the stage. Lightweight but worthwhile. How can you go wrong with Ralph Richardson, Olivier and Merle Oberon."
58
+ This movie turned out to be better than I had expected it to be. Some parts were pretty funny. It was nice to have a movie with a new plot.
59
+ This movie is terrible. It's about some no brain surfin dude that inherits some company. Does Carrot Top have no shame?
60
+ Adrian Pasdar is excellent is this film. He makes a fascinating woman.
61
+ "An unfunny, unworthy picture which is an undeserving end to Peter Sellers' career. It is a pity this movie was ever made."
62
+ "The plot was really weak and confused. This is a true Oprah flick. (In Oprah's world, all men are evil and all women are victims.)"
larger_test.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ do u really want me to unistall this app...whenever i open this app, u ask for review...how many times i have to give review-feedback...
2
+ I don't like how it asks to give a review everytime I open the app
3
+ Stop asking for review everytime I open the app..it's pathetic..the updated version sucks
4
+ If i already provided the review for this application but why this application is asking for reviews every time when I am opening the application so improve this feature. This feature is very irritating. Apart from that overall experience is very good.
5
+ as you guys bother me so much for the review even i gave my opinion already but every time i open the app it ask for review so i gave it 1 star , previsiousally it was 4 star.
6
+ repeatedly asking to rate the app...
7
+ Very irritating. Everytime i open app it asked for review hence giving 2 instead of 4
8
+ stop asking for ratings every time when open the app. I had rated this app 5 star but now every time app asking for give rating, its disgusting. so I'll give only one star
9
+ I swear if i see that feedback ad one more time im gonna uninstall this app and start using another one else
10
+ I'm am downgrading my rating because the app is good and I also gave it 5 satar but why I am getting unnecessary pop up to give it review please fix it
11
+ No rating ... worsted app ... please playstore delete this app
12
+ Much bad experience . when I used to open the app it requires feedback every time.
13
+ I already rated it then why always it pop up... Its irritate me a lot everytime when I open this app... Plz fix this
14
+ This app any time ask me for rating i hate this
15
+ Very Good app but asks to rate it all the time....all these popups are annoying when you are in hurry
16
+ I'm already rated this app. And now from one week and adove this app is asking for rating please solve the problem as soon as possible. Thank you
17
+ The app is too good but it send me notification again and again to rate it that's why am I giving one star to it
18
+ Constantly asks me to rate the app! So annoying.
19
+ Today again i am go to rating this app due to its ad less and best interface with good features again more
20
+ App is very disturbing .. very bad app
21
+ If I don't want to rate it's my personal choice, so why this app gives notification every single time,, it's quite frustrating therefore 1star Other wise app is best for it's work
22
+ For frustrating me every time to rate your app
23
+ Super exalent app can you pls reply to my comment how is my review so thanks to provide this app thanks
24
+ Vey bad app so disturbance ... All time get notification about rating... That allready done
25
+ Earlier I had given 5 stars to this app but even after giving review in this app, it speaks to rate now, so I removed 5 stars in edit review and put 1 star, now this app will be happy.
26
+ Every time I open the app it asking rating. I rated 4 before now I de rate to 1."
27
+ I love this app. So damn good
28
+ This app rocks!!!!!!
29
+ This app totally sucks
30
+ Wow what a useful app
31
+ I cant live without this app!
32
+ Shit! This app rocks. I can never imagine going out without using this app. So damn useful!
33
+ Elon musk is the founder of SpaceX
34
+ Parasites suck blood out of deer
35
+ A review of his conduct revealed he violated the rules everytime he downloaded movies
36
+ My god. If only I could rate this app 100 stars for its excellence
37
+ The board conducted a review and determined electons were fair
38
+ WTF!
39
+ Crossing the chasm is a great book review that is often quoted by readers
40
+ Why am I seeing everything in double like I m drink - is my vision going bad???
41
+ Expolanets keep going round and round their stars many times a day
42
+ I have recommended this app to so many friends and they love it too
43
+ The sale of electric cars has gone up since the increse in gas prices
44
+ Stable diffusion app is the rage on the internet with multiple people either downloading on the laptop and trying it or useing the web interface
45
+ OpenAI is trying to make money by exposin their NLP apps through an API
46
+ Co:here, Ai21 and other are trying to emulate OpenAIs business model and exposing NLP apps that depend on LLMs through metered APIs
47
+ Serverless GPUs have emerged as a new business model catering to end users who want to host apps
48
+ Cerberas, Sambanova are betting on models to grow larger and harder to train on traditional GPUS
49
+ Nvidia released Hopper series as a successor to A100 series
50
+ Oh my god! I am done with this app!
51
+ Oh my god! I love this sweet puppy. He rounds around the chair so many times
52
+ I plan to write a nasty review for this shitty movie
long_form_logo_with_icon.png ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ scipy
3
+ torch
4
+ sentencepiece
5
+ openai
run.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit run app.py --server.port 80 "1" "sim_app_examples.json" "sim_app_models.json"
2
+
small_test.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ machine learning
2
+ Transformers have become the staple architecture for deep learning models
3
+ NLP
4
+ Diffusion models
5
+ natural language processing
6
+ deep learning
7
+ Deep Learning
8
+ Support vector machines
9
+ random forests
10
+ probability distribution
11
+ Cross entropy loss
12
+ Kullback leibler divergence
13
+ Shannon entropy
14
+ Activation functions
15
+ ATM
16
+ deep fakes
17
+ AGI
18
+ AI
19
+ deep trouble
20
+ artificial intelligence
21
+ deep diving
22
+ artificial snow
23
+ shallow waters
24
+ deep end
25
+ RELU
26
+ sigmoid
27
+ GELU
28
+ RNN
29
+ CNN
30
+ Gaussian
text-similarity-ada-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-ada-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-ada-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-babbage-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001imdb_sent_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001larger_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-curie-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
text-similarity-davinci-001imdb_sent_embed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5165d88bbd1b913de4d9bb82cf64d078f92552da1b4556e5f0b0cb436c2332f0
3
+ size 17274908
text-similarity-davinci-001larger_test_embed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69a71cb2c1ce371c7372ac75ab631120ed5e508cd85c9872412f59d88b14ca0f
3
+ size 14491019
text-similarity-davinci-001small_test_embed.json ADDED
The diff for this file is too large to render. See raw diff
 
twc_clustering.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.spatial.distance import cosine
2
+ import argparse
3
+ import json
4
+ import pdb
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import time
9
+ from collections import OrderedDict
10
+
11
+
12
class TWCClustering:
    """Z-score-threshold clustering over a cosine-similarity matrix.

    Thresholds are expressed as z-scores: a threshold ``t`` maps to the
    cosine value ``mean + t * std``, where mean/std are computed over the
    full pairwise similarity matrix.
    """

    def __init__(self):
        print("In Zscore Clustering")

    def compute_matrix(self, embeddings):
        """Return the pairwise cosine-similarity matrix for *embeddings*.

        Each row is L2-normalized first, so row-wise inner products equal
        cosine similarities. Input is any 2-D array-like of vectors.
        """
        embeddings = np.array(embeddings)
        # Row-normalize, then a single inner product yields all cosines.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / norms
        similarity_matrix = np.inner(normalized, normalized)
        return similarity_matrix

    def get_terms_above_threshold(self, matrix, embeddings, pivot_index, threshold):
        """Return indices j >= pivot_index whose similarity to the pivot meets threshold."""
        return [j for j in range(pivot_index, len(embeddings))
                if matrix[pivot_index][j] >= threshold]

    def update_picked_dict_arr(self, picked_dict, arr):
        """Mark every index in *arr* as already assigned to a cluster."""
        for idx in arr:
            picked_dict[idx] = 1

    def update_picked_dict(self, picked_dict, in_dict):
        """Mark every key of *in_dict* as already assigned to a cluster."""
        for key in in_dict:
            picked_dict[key] = 1

    def find_pivot_subgraph(self, pivot_index, arr, matrix, threshold, strict_cluster=True):
        """Choose the best cluster center among the nodes in *arr*.

        The center is the node with the maximal summed similarity to the
        other members (first such node wins ties). With strict_cluster=True,
        pairs below *threshold* are excluded from both the score and the
        neighbor set.

        Returns a dict: {"pivot_index": center, "orig_index": seed pivot,
        "neighs": OrderedDict of member -> similarity-to-center, descending}.
        """
        center_index = pivot_index
        center_score = 0
        center_dict = {}
        for node_i in arr:
            running_score = 0
            temp_dict = {}
            for node_j in arr:
                sim = matrix[node_i][node_j]
                if (sim < threshold) and strict_cluster:
                    continue
                running_score += sim
                temp_dict[node_j] = sim
            if running_score > center_score:
                center_index = node_i
                center_dict = temp_dict
                center_score = running_score
        sorted_d = OrderedDict(sorted(center_dict.items(), key=lambda kv: kv[1], reverse=True))
        return {"pivot_index": center_index, "orig_index": pivot_index, "neighs": sorted_d}

    def update_overlap_stats(self, overlap_dict, cluster_info):
        """Count, per node, how many clusters the node appears in."""
        for val in cluster_info["neighs"]:
            overlap_dict[val] = overlap_dict.get(val, 0) + 1

    def bucket_overlap(self, overlap_dict):
        """Histogram overlap counts: overlap-count -> number of nodes with that count."""
        bucket_dict = {}
        for count in overlap_dict.values():
            bucket_dict[count] = bucket_dict.get(count, 0) + 1
        return OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1], reverse=False))

    def merge_clusters(self, ref_cluster, curr_cluster):
        """Append to *ref_cluster* (in place) the members of *curr_cluster* not already present."""
        seen = set(ref_cluster)
        for node in curr_cluster:
            if node not in seen:
                ref_cluster.append(node)
                seen.add(node)

    def non_overlapped_clustering(self, matrix, embeddings, threshold, mean, std, cluster_dict):
        """Partition nodes into disjoint clusters; appends to cluster_dict["clusters"].

        Returns an empty dict (no overlap statistics exist for disjoint clusters).
        """
        picked_dict = {}
        candidates = []
        zscore = mean + threshold * std  # hoisted: loop-invariant

        # Seed one candidate cluster from each not-yet-assigned node.
        for i in range(len(embeddings)):
            if i in picked_dict:
                continue
            arr = self.get_terms_above_threshold(matrix, embeddings, i, zscore)
            if not arr:
                # Fix: with a very high threshold even the pivot's self-similarity
                # can miss the cutoff; keep the pivot as a singleton so the
                # arr[0] access below cannot raise IndexError.
                arr = [i]
            candidates.append(arr)
            self.update_picked_dict_arr(picked_dict, arr)

        # Merge candidate clusters that share members until all are disjoint;
        # restart the scan after every merge (same strategy as before, each
        # merge shrinks the list so this terminates).
        merged = True
        while merged:
            merged = False
            i = 0
            while i < len(candidates) and not merged:
                j = i + 1
                while j < len(candidates) and not merged:
                    if any(node in candidates[i] for node in candidates[j]):
                        self.merge_clusters(candidates[i], candidates[j])
                        candidates.pop(j)
                        merged = True
                    j += 1
                i += 1

        for arr in candidates:
            cluster_info = self.find_pivot_subgraph(arr[0], arr, matrix, zscore, strict_cluster=False)
            cluster_dict["clusters"].append(cluster_info)
        return {}

    def overlapped_clustering(self, matrix, embeddings, threshold, mean, std, cluster_dict):
        """Build possibly-overlapping clusters; appends to cluster_dict["clusters"].

        Returns an OrderedDict histogram of node overlap counts.
        """
        picked_dict = {}
        overlap_dict = {}
        zscore = mean + threshold * std
        for i in range(len(embeddings)):
            if i in picked_dict:
                continue
            arr = self.get_terms_above_threshold(matrix, embeddings, i, zscore)
            cluster_info = self.find_pivot_subgraph(i, arr, matrix, zscore, strict_cluster=True)
            self.update_picked_dict(picked_dict, cluster_info["neighs"])
            self.update_overlap_stats(overlap_dict, cluster_info)
            cluster_dict["clusters"].append(cluster_info)
        return self.bucket_overlap(overlap_dict)

    def cluster(self, output_file, texts, embeddings, threshold, clustering_type):
        """Cluster *embeddings* and return a result dict.

        Parameters:
            output_file: accepted for interface compatibility; not used here.
            texts: accepted for interface compatibility; not used here.
            embeddings: 2-D array-like of sentence vectors.
            threshold: z-score threshold (cosine cutoff = mean + threshold*std).
            clustering_type: "overlapped" for overlapping clusters, anything
                else for disjoint clusters.

        Returns {"clusters": [...], "info": {mean, std, current_threshold,
        zscores, overlap}}.
        """
        is_overlapped = clustering_type == "overlapped"
        matrix = self.compute_matrix(embeddings)
        mean = np.mean(matrix)
        std = np.std(matrix)
        # Tabulate which cosine each integer z-score threshold maps to.
        zscores = []
        inc = 0
        value = mean
        while value < 1:
            zscores.append({"threshold": inc, "cosine": round(value, 2)})
            if std == 0:
                break  # fix: zero spread would otherwise loop forever
            inc += 1
            value = mean + inc * std
        cluster_dict = {"clusters": []}
        if is_overlapped:
            sorted_d = self.overlapped_clustering(matrix, embeddings, threshold, mean, std, cluster_dict)
        else:
            sorted_d = self.non_overlapped_clustering(matrix, embeddings, threshold, mean, std, cluster_dict)
        curr_threshold = f"{threshold} (cosine:{mean + threshold * std:.2f})"
        cluster_dict["info"] = {"mean": mean, "std": std, "current_threshold": curr_threshold,
                                "zscores": zscores, "overlap": list(sorted_d.items())}
        return cluster_dict
176
+
177
+
twc_embeddings.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModel, AutoTokenizer
2
+ from transformers import AutoModelForCausalLM
3
+ from scipy.spatial.distance import cosine
4
+ import argparse
5
+ import json
6
+ import pdb
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
def read_text(input_file):
    """Read *input_file* and return its newline-terminated lines.

    Splits on "\n" and drops the final element, matching the original
    split("\n")[:-1] behavior (for a newline-terminated file the last
    element is ""; note a trailing unterminated line would be dropped too).
    Fix: the file handle is now closed via a context manager instead of
    being leaked.
    """
    with open(input_file) as fp:
        arr = fp.read().split("\n")
    return arr[:-1]
13
+
14
+
15
class CausalLMModel:
    """Ranks documents against a query using a causal LM.

    Unlike the embedding models in this file, scoring is done by the
    log-probability the LM assigns to the query as a continuation of a
    prompt that embeds each document (no vector embeddings are produced).
    """

    def __init__(self):
        # Model and tokenizer are loaded lazily in init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In CausalLMModel Constructor")

    def init_model(self,model_name = None):
        """Load tokenizer and causal LM (default: EleutherAI/gpt-neo-125M) in eval mode."""
        # The package will take care of downloading the models automatically.
        if (self.debug):
            print("Init model",model_name)
        # For best performance: EleutherAI/gpt-j-6B
        if (model_name is None):
            model_name = "EleutherAI/gpt-neo-125M"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model.eval()
        # Prompt template; each document is substituted for "{}" and the
        # query is scored as the continuation after the final quote.
        self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Score every document against the query.

        texts[0] is the query, the remaining lines are documents
        (input_data is a file path when is_file is True, else a list).
        Returns (texts, scores) where scores[i] is the summed log-probability
        of the query tokens continuing the prompt built from docs[i].
        """
        if (self.debug):
            print("Computing embeddings for:", input_data[:20])
        model = self.model
        tokenizer = self.tokenizer

        texts = read_text(input_data) if is_file == True else input_data
        query = texts[0]
        docs = texts[1:]

        scores = []
        for doc in docs:
            context = self.prompt.format(doc)

            context_enc = tokenizer.encode(context, add_special_tokens=False)
            continuation_enc = tokenizer.encode(query, add_special_tokens=False)
            # Slice off the last token, as we take its probability from the one before
            model_input = torch.tensor(context_enc+continuation_enc[:-1])
            continuation_len = len(continuation_enc)
            input_len, = model_input.shape

            # [seq_len] -> [seq_len, vocab]
            logprobs = torch.nn.functional.log_softmax(model(model_input)[0], dim=-1).cpu()
            # [seq_len, vocab] -> [continuation_len, vocab]
            logprobs = logprobs[input_len-continuation_len:]
            # Gather the log probabilities of the continuation tokens -> [continuation_len]
            logprobs = torch.gather(logprobs, 1, torch.tensor(continuation_enc).unsqueeze(-1)).squeeze(-1)
            score = torch.sum(logprobs)
            scores.append(score.tolist())
        return texts,scores

    def output_results(self,output_file,texts,scores,main_index = 0):
        """Map each document to its score, sort descending, optionally write JSON.

        Note: despite the name, *scores* here are LM log-probabilities, not
        cosine similarities. Returns the sorted {doc: score} dict.
        """
        cosine_dict = {}
        docs = texts[1:]
        if (self.debug):
            print("Total sentences",len(texts))
        assert(len(scores) == len(docs))
        for i in range(len(docs)):
            cosine_dict[docs[i]] = scores[i]

        if (self.debug):
            print("Input sentence:",texts[main_index])
        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key in sorted_dict:
                print("Document score for \"%s\" is: %.3f" % (key[:100], sorted_dict[key]))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
88
+
89
+
90
class SGPTQnAModel:
    """Asymmetric (query vs. document) SGPT embeddings.

    Queries are wrapped in "[" ... "]" marker tokens and documents in
    "{" ... "}" markers before position-weighted mean pooling.
    """

    def __init__(self):
        # Model and tokenizer are loaded lazily in init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In SGPT Q&A Constructor")


    def init_model(self,model_name = None):
        """Load tokenizer/model (default: SGPT-125M msmarco specb bitfit) and
        cache the bracket token ids used to mark queries and documents."""
        # The package will take care of downloading the models automatically.
        # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
        if (self.debug):
            print("Init model",model_name)
        if (model_name is None):
            model_name = "Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        # Special bracket token ids: queries use [...], documents use {...}.
        self.SPECB_QUE_BOS = self.tokenizer.encode("[", add_special_tokens=False)[0]
        self.SPECB_QUE_EOS = self.tokenizer.encode("]", add_special_tokens=False)[0]

        self.SPECB_DOC_BOS = self.tokenizer.encode("{", add_special_tokens=False)[0]
        self.SPECB_DOC_EOS = self.tokenizer.encode("}", add_special_tokens=False)[0]


    def tokenize_with_specb(self,texts, is_query):
        """Tokenize *texts* and wrap each sequence in query or document markers.

        Mutates the per-sequence id/mask lists in place, then pads the batch
        and returns it as tensors.
        """
        # Tokenize without padding
        batch_tokens = self.tokenizer(texts, padding=False, truncation=True)
        # Add special brackets & pay attention to them
        for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
            if is_query:
                seq.insert(0, self.SPECB_QUE_BOS)
                seq.append(self.SPECB_QUE_EOS)
            else:
                seq.insert(0, self.SPECB_DOC_BOS)
                seq.append(self.SPECB_DOC_EOS)
            att.insert(0, 1)
            att.append(1)
        # Add padding
        batch_tokens = self.tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
        return batch_tokens

    def get_weightedmean_embedding(self,batch_tokens, model):
        """Position-weighted mean pooling over the last hidden state.

        Token weights grow linearly with position (1..seq_len), so later
        tokens contribute more. Returns a [bs, hid_dim] tensor.
        (Note: pools with self.model; the *model* argument is unused.)
        """
        # Get the embeddings
        with torch.no_grad():
            # Get hidden state of shape [bs, seq_len, hid_dim]
            last_hidden_state = self.model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state

        # Get weights of shape [bs, seq_len, hid_dim]
        weights = (
            torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
            .unsqueeze(0)
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float().to(last_hidden_state.device)
        )

        # Get attn mask of shape [bs, seq_len, hid_dim]
        input_mask_expanded = (
            batch_tokens["attention_mask"]
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float()
        )

        # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
        sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

        embeddings = sum_embeddings / sum_mask

        return embeddings

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Embed the query (texts[0]) and documents (texts[1:]) separately.

        input_data is a file path when is_file is True, else a list of
        strings. Returns (texts, (query_embeddings, doc_embeddings)).
        """
        if (self.debug):
            print("Computing embeddings for:", input_data[:20])
        model = self.model
        tokenizer = self.tokenizer

        texts = read_text(input_data) if is_file == True else input_data

        queries = [texts[0]]
        docs = texts[1:]
        query_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(queries, is_query=True), self.model)
        doc_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(docs, is_query=False), self.model)
        return texts,(query_embeddings,doc_embeddings)



    def output_results(self,output_file,texts,embeddings,main_index = 0):
        """Rank documents by cosine similarity to the query embedding.

        *embeddings* is the (query_embeddings, doc_embeddings) pair returned
        by compute_embeddings. Optionally writes the ranking as JSON and
        returns the sorted {doc: similarity} dict.
        """
        # Calculate cosine similarities
        # Cosine similarities are in [-1, 1]. Higher means more similar
        query_embeddings = embeddings[0]
        doc_embeddings = embeddings[1]
        cosine_dict = {}
        queries = [texts[0]]
        docs = texts[1:]
        if (self.debug):
            print("Total sentences",len(texts))
        for i in range(len(docs)):
            cosine_dict[docs[i]] = 1 - cosine(query_embeddings[0], doc_embeddings[i])

        if (self.debug):
            print("Input sentence:",texts[main_index])
        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key in sorted_dict:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
+ return sorted_dict
202
+
203
+
204
class SimCSEModel:
    """Sentence similarity backed by a SimCSE checkpoint's pooler output."""

    def __init__(self):
        # Both are populated by init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In SimCSE constructor")

    def init_model(self, model_name=None):
        """Load tokenizer and model; defaults to the supervised RoBERTa-large SimCSE."""
        if model_name is None:
            model_name = "princeton-nlp/sup-simcse-roberta-large"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def compute_embeddings(self, input_file_name, input_data, is_file):
        """Embed sentences (file path when is_file is True, else a list).

        Returns (sentences, embeddings) where embeddings is the model's
        pooler output for the padded/truncated batch.
        """
        sentences = read_text(input_data) if is_file == True else input_data
        batch = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            vectors = self.model(**batch, output_hidden_states=True, return_dict=True).pooler_output
        return sentences, vectors

    def output_results(self, output_file, texts, embeddings, main_index=0):
        """Rank all texts by cosine similarity to texts[main_index].

        Cosine similarities lie in [-1, 1]; higher means more similar.
        Optionally writes the ranking as JSON; returns the sorted dict.
        """
        scores = {text: 1 - cosine(embeddings[main_index], embeddings[pos])
                  for pos, text in enumerate(texts)}
        ranked = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
        if self.debug:
            for key in ranked:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, ranked[key]))
        if output_file is not None:
            with open(output_file, "w") as fp:
                fp.write(json.dumps(ranked, indent=0))
        return ranked
+ return sorted_dict
242
+
243
+
244
+
245
class SGPTModel:
    """Symmetric SGPT sentence embeddings via position-weighted mean pooling."""

    def __init__(self):
        # Model and tokenizer are loaded lazily in init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In SGPT Constructor")


    def init_model(self,model_name = None):
        """Load tokenizer and model (default: SGPT-125M NLI bitfit) in eval mode."""
        # The package will take care of downloading the models automatically.
        # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
        if (self.debug):
            print("Init model",model_name)
        if (model_name is None):
            model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
        #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
        #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
        #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
        # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
        self.model.eval()

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Embed sentences (input_data is a file path when is_file is True,
        else a list of strings).

        Returns (texts, embeddings): a [bs, hid_dim] tensor produced by
        position-weighted mean pooling over the last hidden state, where
        token weights grow linearly with position (1..seq_len).
        """
        if (self.debug):
            print("Computing embeddings for:", input_data[:20])
        model = self.model
        tokenizer = self.tokenizer

        texts = read_text(input_data) if is_file == True else input_data

        # Tokenize input texts
        batch_tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        # Get the embeddings
        with torch.no_grad():
            # Get hidden state of shape [bs, seq_len, hid_dim]
            last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state

        # Get weights of shape [bs, seq_len, hid_dim]; later tokens weigh more.
        weights = (
            torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
            .unsqueeze(0)
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float().to(last_hidden_state.device)
        )

        # Get attn mask of shape [bs, seq_len, hid_dim]
        input_mask_expanded = (
            batch_tokens["attention_mask"]
            .unsqueeze(-1)
            .expand(last_hidden_state.size())
            .float()
        )

        # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
        sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

        embeddings = sum_embeddings / sum_mask
        return texts,embeddings

    def output_results(self,output_file,texts,embeddings,main_index = 0):
        """Rank all texts by cosine similarity to texts[main_index].

        Optionally writes the ranking as JSON; returns the sorted
        {text: similarity} dict.
        """
        # Calculate cosine similarities
        # Cosine similarities are in [-1, 1]. Higher means more similar
        cosine_dict = {}
        if (self.debug):
            print("Total sentences",len(texts))
        for i in range(len(texts)):
            cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])

        if (self.debug):
            print("Input sentence:",texts[main_index])
        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key in sorted_dict:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
+ return sorted_dict
328
+
329
+
330
+
331
+
332
+
333
class HFModel:
    """Sentence embeddings from a Hugging Face encoder with mean pooling."""

    def __init__(self):
        # Both are populated by init_model().
        self.model = None
        self.tokenizer = None
        self.debug = False
        print("In HF Constructor")


    def init_model(self, model_name=None):
        """Load tokenizer/model (default: all-MiniLM-L6-v2) and switch to eval mode."""
        if model_name is None:
            model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighting each token by the attention mask."""
        token_embeddings = model_output[0]  # first element holds all token embeddings
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against divide-by-zero
        return summed / counts

    def compute_embeddings(self, input_file_name, input_data, is_file):
        """Embed sentences (input_data is a file path when is_file is True,
        else a list). Returns (sentences, L2-normalized embeddings)."""
        sentences = read_text(input_data) if is_file == True else input_data
        encoded = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

        # Forward pass without gradients, then pool and normalize.
        with torch.no_grad():
            outputs = self.model(**encoded)
        pooled = self.mean_pooling(outputs, encoded['attention_mask'])
        normalized = F.normalize(pooled, p=2, dim=1)
        return sentences, normalized

    def output_results(self, output_file, texts, embeddings, main_index=0):
        """Rank all texts by cosine similarity to texts[main_index].

        Cosine similarities lie in [-1, 1]; higher means more similar.
        Optionally writes the ranking as JSON; returns the sorted dict.
        """
        sims = {}
        for pos, text in enumerate(texts):
            sims[text] = 1 - cosine(embeddings[main_index], embeddings[pos])
        ranked = dict(sorted(sims.items(), key=lambda item: item[1], reverse=True))
        if self.debug:
            for key in ranked:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, ranked[key]))
        if output_file is not None:
            with open(output_file, "w") as fp:
                fp.write(json.dumps(ranked, indent=0))
        return ranked
+
395
+
396
+
397
+ if __name__ == '__main__':
398
+ parser = argparse.ArgumentParser(description='SGPT model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
399
+ parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
400
+ parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
401
+ parser.add_argument('-model', action="store", dest="model",default="sentence-transformers/all-MiniLM-L6-v2",help="model name")
402
+
403
+ results = parser.parse_args()
404
+ obj = HFModel()
405
+ obj.init_model(results.model)
406
+ texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
407
+ results = obj.output_results(results.output,texts,embeddings)
twc_openai_embeddings.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.spatial.distance import cosine
2
+ import argparse
3
+ import json
4
+ import os
5
+ import openai
6
+ import pdb
7
+
8
def read_text(input_file):
    """Read input_file and return its lines as a list of strings.

    The file is expected to end with a trailing newline, so the final empty
    element produced by split("\n") is dropped (preserves original behavior).
    NOTE(review): if a file does NOT end with "\n", its last real line is
    discarded — confirm inputs always carry a trailing newline.
    """
    # Use a context manager so the handle is closed deterministically
    # (the original leaked the open file until garbage collection).
    with open(input_file) as fp:
        lines = fp.read().split("\n")
    return lines[:-1]
11
+
12
+
13
class OpenAIModel:
    """Sentence-embedding wrapper around the OpenAI embeddings API.

    Embeddings are cached on disk per (model, input file) as JSON so repeated
    runs do not re-call the billed API.
    """

    def __init__(self):
        self.debug = False
        self.model_name = None
        # When True, proceed even without OPENAI_API_KEY (e.g. when cached
        # embeddings are expected to be present on disk).
        self.skip_key = True
        print("In OpenAI API constructor")

    def init_model(self, model_name=None):
        """Pick up the API key from the environment and select the model.

        Falls back to "text-similarity-ada-001" when model_name is None.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")
        if openai.api_key is None:  # fixed: identity check, not '== None'
            openai.api_key = ""
            print("API key not set")

        if len(openai.api_key) == 0 and not self.skip_key:
            print("Open API key not set")

        self.model_name = "text-similarity-ada-001" if model_name is None else model_name
        print("OpenAI: Init model complete", model_name)

    def compute_embeddings(self, input_file_name, input_data, is_file):
        """Return (texts, embeddings) for the sentences in input_data.

        input_file_name is used only to derive the cache file name;
        input_data is a file path when is_file is True, otherwise a list of
        sentences. Returns ([], []) when no API key is set and keys are
        required.
        """
        if len(openai.api_key) == 0 and not self.skip_key:
            print("Open API key not set")
            return [], []
        # Cache name convention: <model><input stem>_embed.json — matches the
        # pre-computed cache files shipped in this repository.
        in_file = self.model_name + '.'.join(input_file_name.split('.')[:-1]) + "_embed.json"
        cached = False
        embeddings = []
        try:
            # Fixed: close the cache file, and only mark 'cached' after a
            # successful parse (the original set cached=True before json.load,
            # so a corrupt cache left 'embeddings' unbound).
            with open(in_file) as fp:
                embeddings = json.load(fp)
            cached = True
            print("Using cached embeddings")
        except (OSError, ValueError):
            # Cache miss or unreadable/corrupt cache: recompute below.
            # (json.JSONDecodeError is a ValueError subclass.)
            pass

        texts = read_text(input_data) if is_file else input_data
        if not cached:
            print(f"Computing embeddings for {input_file_name} and model {self.model_name}")
            response = openai.Embedding.create(
                input=texts,
                model=self.model_name
            )
            embeddings = [item['embedding'] for item in response['data']]
            # Persist for subsequent runs.
            with open(in_file, "w") as fp:
                json.dump(embeddings, fp)
        return texts, embeddings

    def output_results(self, output_file, texts, embeddings, main_index=0):
        """Rank sentences by cosine similarity to texts[main_index].

        Returns the ranked {sentence: similarity} dict (similarities in
        [-1, 1], higher = more similar) and writes it as JSON to output_file
        when one is given. Returns {} when no API key is set and keys are
        required.
        """
        if len(openai.api_key) == 0 and not self.skip_key:
            print("Open API key not set")
            return {}
        # scipy's cosine() is a distance, so similarity = 1 - distance.
        cosine_dict = {}
        for i in range(len(texts)):
            cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])

        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1], reverse=True))
        if self.debug:
            for key in sorted_dict:
                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
        if output_file is not None:
            with open(output_file, "w") as fp:
                fp.write(json.dumps(sorted_dict, indent=0))
        return sorted_dict
89
+
90
+
91
+
92
if __name__ == '__main__':
    # Command-line driver: embed the sentences in -input with the OpenAI API
    # and write ranked cosine similarities to -output.
    parser = argparse.ArgumentParser(description='OpenAI model for sentence embeddings ',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-input', action="store", dest="input", required=True,
                        help="Input file with sentences")
    parser.add_argument('-output', action="store", dest="output", default="output.txt",
                        help="Output file with results")
    parser.add_argument('-model', action="store", dest="model",
                        default="text-similarity-ada-001",
                        help="model name")

    args = parser.parse_args()
    obj = OpenAIModel()
    obj.init_model(args.model)
    # BUG FIX: compute_embeddings requires (input_file_name, input_data, is_file);
    # the original passed only one positional argument, raising TypeError.
    # The input path serves as both the cache-name seed and the data source.
    texts, embeddings = obj.compute_embeddings(args.input, args.input, is_file=True)
    results = obj.output_results(args.output, texts, embeddings)