import numpy as np
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

nltk.download('stopwords')
nltk.download('punkt')

STOP_WORDS = list(stopwords.words('english'))
BERTOPIC_REPRESENTATIONS = [
    "KeyBERTInspired",
    "MaximalMarginalRelevance",
]

TRANSFORMERS = ["all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"]
TRANSFORMERS_INFO = ["all-mpnet-base-v2: All-round model tuned for many use-cases. "
                     "Trained on a large and diverse dataset of over 1 billion training pairs",

                     "multi-qa-mpnet-base-dot-v1: This model was tuned for semantic search: Given a query/question, "
                     "if can find relevant passages. "
                     "It was trained on a large and diverse set of (question, answer) pairs."
                     ]


def get_bertopic_representation(representation: str):
    """
    maps a representation name to the corresponding BERTopic representation model
    :param representation: one of BERTOPIC_REPRESENTATIONS
    :return: representation model instance, or None for an unknown name
    """
    if representation == BERTOPIC_REPRESENTATIONS[0]:
        return KeyBERTInspired()
    elif representation == BERTOPIC_REPRESENTATIONS[1]:
        return MaximalMarginalRelevance()
    else:
        return None
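# Usage sketch (not part of the original module): the returned object can be
# passed to BERTopic via its representation_model parameter, e.g.
#   from bertopic import BERTopic
#   topic_model = BERTopic(representation_model=get_bertopic_representation("KeyBERTInspired"))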


def tokenize_explode(df, col):
    """
    tokenizes the given column and explodes the dataframe so each row holds one token
    :param df:
    :param col: name of the text column to tokenize
    :return: exploded dataframe with a stripped, lower-cased 'tokenized' column
    """
    df['tokenized'] = df[col].apply(word_tokenize)
    df = df.explode('tokenized')
    df['tokenized'] = df['tokenized'].str.strip().str.lower()

    return df


def cleanup_tokens(df, col):
    """
    drops short tokens, numbers, punctuation and stopwords from the given column
    :param df:
    :param col: name of the token column to filter
    :return: filtered dataframe
    """
    df = df[df[col].str.len() > 2]
    df = df[~df[col].str.contains(r'^\d+\.?\d*$', regex=True)]
    df = df[~df[col].isin(list(string.punctuation))]
    df = df[~df[col].isin(STOP_WORDS)]

    return df
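# A minimal usage sketch (DataFrame and column names are illustrative, not from
# the original module) showing how tokenize_explode and cleanup_tokens chain:
#   import pandas as pd
#   docs = pd.DataFrame({"text": ["Topic modelling with 2 short example sentences."]})
#   tokens = cleanup_tokens(tokenize_explode(docs, "text"), "tokenized")
#   tokens["tokenized"].tolist()  # -> ['topic', 'modelling', 'short', 'example', 'sentences']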


def get_embedding_model(transformer=TRANSFORMERS[0]) -> SentenceTransformer:
    """
    loads the given sentence-transformer model
    :param transformer: model name, one of TRANSFORMERS by default
    :return: SentenceTransformer instance
    """
    sentence_model = SentenceTransformer(transformer)
    return sentence_model


def str_to_vector_list(text_list, sentence_model, replace_dict=None):
    """
    embeds the given text list using the provided sentence-transformer model
    :param text_list:
    :param sentence_model:
    :param replace_dict: mapping of substrings to replace in the texts before encoding
    :return: list of embedding vectors
    """
    # strip punctuation, then apply any user-supplied replacements
    text_list = [re.sub(r'[^\w\s]', '', str(x)) for x in text_list]
    if replace_dict:
        for stp in replace_dict:
            text_list = [str(x).replace(stp, replace_dict[stp]) for x in text_list]

    embeddings = sentence_model.encode(text_list, show_progress_bar=True, batch_size=1000)
    return embeddings.tolist()
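# Illustrative sketch (model choice and example texts are assumptions): encode a
# few strings and expand an abbreviation via replace_dict before embedding:
#   model = get_embedding_model(TRANSFORMERS[0])
#   vectors = str_to_vector_list(["NLP for topic modelling", "ML pipelines"],
#                                model, replace_dict={"ML": "machine learning"})
#   len(vectors), len(vectors[0])  # -> (2, 768) for all-mpnet-base-v2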


def remove_unnecessary_tokens_from_df(df, columns, extra_stopwords=None) -> None:
    """
    removes unnecessary tokens (punctuation and extra stopwords) from the given
    columns of the dataframe, modifying it in place
    :param df:
    :param columns:
    :param extra_stopwords:
    :return:
    """
    df[columns] = df[columns].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))

    if extra_stopwords:
        for stp in extra_stopwords:
            df[columns] = df[columns].apply(lambda x: x.str.replace(stp, ' ', regex=False))
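# Hedged example (column names and stopword are assumptions): clean two text
# columns in place and drop a domain-specific stopword:
#   remove_unnecessary_tokens_from_df(df, ["title", "body"], extra_stopwords=["lorem"])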


def cosine_sim_matrix(embeddings_a, embeddings_b) -> np.ndarray:
    """
    computes the cosine similarity matrix for the given embeddings
    :param embeddings_a:
    :param embeddings_b:
    :return: numpy array holding the pairwise cosine similarities
    """
    return np.array(
        util.pytorch_cos_sim(embeddings_a, embeddings_b)
    )
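

# Small end-to-end demo, guarded so it only runs when the module is executed
# directly; the example sentences are illustrative only.
if __name__ == "__main__":
    demo_model = get_embedding_model(TRANSFORMERS[0])
    demo_texts = ["the cat sits on the mat", "a dog plays in the garden"]
    demo_embeddings = str_to_vector_list(demo_texts, demo_model)
    # pairwise cosine similarities between the two sentences
    print(cosine_sim_matrix(demo_embeddings, demo_embeddings))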