import re
import numpy as np

import tiktoken

# Function to strip a matching pair of surrounding double quotes from a string
def strip_quotes(text):
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        return text[1:-1]
    return text
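
# Illustrative usage (a sketch, not part of the original module): only a
# matching pair of double quotes is removed; single quotes are left untouched.
#   >>> strip_quotes('"hello"')
#   'hello'
#   >>> strip_quotes("'hello'")
#   "'hello'"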

def strtobool(val):
    val = val.lower()
    if val in ('yes', 'true', 't', '1'):
        return True
    elif val in ('no', 'false', 'f', '0'):
        return False
    else:
        raise ValueError(f"Invalid truth value {val}")
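
# Illustrative usage (a sketch): parsing is case-insensitive, and anything
# outside the recognized truth values raises ValueError.
#   >>> strtobool("Yes")
#   True
#   >>> strtobool("0")
#   False
#   (e.g. strtobool("maybe") raises ValueError: Invalid truth value maybe)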


def split_camel_case(word):
    # This regular expression pattern matches the transition from a lowercase letter to an uppercase letter
    pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

    # Replace the matched pattern (the empty string between lowercase and uppercase letters) with a space
    split_word = pattern.sub(' ', word)

    return split_word
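
# Illustrative usage (a sketch): a space is inserted at every
# lowercase-to-uppercase transition.
#   >>> split_camel_case("avianInfluenzaOutbreak")
#   'avian Influenza Outbreak'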


# Function to split tokens into chunks
def chunk_tokens(tokens, max_len):
    for i in range(0, len(tokens), max_len):
        yield tokens[i:i + max_len]
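
# Illustrative usage (a sketch): the final chunk may be shorter than max_len.
#   >>> list(chunk_tokens([1, 2, 3, 4, 5], 2))
#   [[1, 2], [3, 4], [5]]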


# Recursively merge dict u into dict d; values from u take precedence
def update_nested_dict(d, u):
    for k, v in u.items():
        if isinstance(v, dict):
            d[k] = update_nested_dict(d.get(k, {}), v)
        else:
            d[k] = v
    return d
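
# Illustrative usage (a sketch with hypothetical config values): unlike
# dict.update, nested keys are merged rather than overwritten wholesale.
#   >>> base = {'model': {'name': 'gpt-4', 'temperature': 0.2}}
#   >>> update_nested_dict(base, {'model': {'temperature': 0.7}})
#   {'model': {'name': 'gpt-4', 'temperature': 0.7}}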


def cleanInputText(textInputLLM):

    # Sequentially apply all the replacements and cleaning operations to textInputLLM

    # Remove leftover artifacts of stringified tuples, e.g. a leading ('\n\n
    # or a trailing \n\n",)
    textInputLLM = re.sub(r'\(\'\\n\\n', ' ', textInputLLM)
    textInputLLM = re.sub(r'\(\"\\n\\n', ' ', textInputLLM)
    textInputLLM = re.sub(r'\\n\\n\',\)', ' ', textInputLLM)
    textInputLLM = re.sub(r'\\n\\n\",\)', ' ', textInputLLM)

    # Strip Markdown-style heading and emphasis markers. A single substitution
    # pass already replaces every occurrence; while loops are kept only where a
    # replacement can create a fresh match (e.g. '----' collapsing to '--').
    textInputLLM = re.sub(r'##\n', '. ', textInputLLM)
    textInputLLM = textInputLLM.replace('###', ' ')
    textInputLLM = textInputLLM.replace('##', ' ')
    textInputLLM = textInputLLM.replace(' # ', ' ')
    while '--' in textInputLLM:
        textInputLLM = textInputLLM.replace('--', '-')
    textInputLLM = re.sub(r'\\\\-', '.', textInputLLM)
    textInputLLM = re.sub(r'\*\*\n', '. ', textInputLLM)
    textInputLLM = re.sub(r'\*\*\*', ' ', textInputLLM)
    textInputLLM = re.sub(r'\*\*', ' ', textInputLLM)
    while re.search(r' \* ', textInputLLM):
        textInputLLM = re.sub(r' \* ', ' ', textInputLLM)
    textInputLLM = re.sub(
        r'is a program of the\n\nInternational Society for Infectious Diseases',
        'is a program of the International Society for Infectious Diseases',
        textInputLLM,
        flags=re.M
    )

    # Optionally, if you also want to flatten all newlines:
    # textInputLLM = re.sub(r'\n\n', '. ', textInputLLM)
    # textInputLLM = re.sub(r'\n', ' ', textInputLLM)

    textInputLLM = re.sub(r' \*\.', ' .', textInputLLM)
    # Collapse the runs of spaces and dots produced by the steps above
    textInputLLM = re.sub(r' {2,}', ' ', textInputLLM)
    while re.search(r'\.\.', textInputLLM):
        textInputLLM = re.sub(r'\.\.', '.', textInputLLM)
    while re.search(r'\. \.', textInputLLM):
        textInputLLM = re.sub(r'\. \.', '.', textInputLLM)

    # Final cleanup of leftover tuple-literal artifacts
    textInputLLM = re.sub(r'\(\"\.', ' ', textInputLLM)
    textInputLLM = re.sub(r'\(\'\.', ' ', textInputLLM)
    textInputLLM = re.sub(r'\",\)', ' ', textInputLLM)
    textInputLLM = re.sub(r'\',\)', ' ', textInputLLM)

    # Strip leading/trailing whitespace
    return textInputLLM.strip()
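
# Illustrative usage (a sketch with a made-up input string): Markdown-style
# markers are stripped, and dashes and spaces are collapsed.
#   >>> cleanInputText("**Bold** headline ##\nBody -- details")
#   'Bold headline . Body - details'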



def encoding_getter(encoding_type: str):
    """
    Returns the appropriate encoding based on the given encoding type
    (either an encoding name or a model name).

    tiktoken supports three encodings used by OpenAI models:

    Encoding name          OpenAI models
    -------------          -------------
    cl100k_base            gpt-4, gpt-3.5-turbo, text-embedding-ada-002
    p50k_base              Codex models, text-davinci-002, text-davinci-003
    r50k_base (or gpt2)    GPT-3 models like davinci

    https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
    """
    if "k_base" in encoding_type:
        return tiktoken.get_encoding(encoding_type)
    try:
        return tiktoken.encoding_for_model(encoding_type)
    except Exception:
        # Default encoding for gpt-4 and gpt-3.5-turbo
        return tiktoken.get_encoding("cl100k_base")
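
# Illustrative usage (a sketch): either an encoding name or a model name works.
#   >>> encoding_getter("cl100k_base").name
#   'cl100k_base'
#   >>> encoding_getter("gpt-3.5-turbo").name
#   'cl100k_base'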


def tokenizer(string: str, encoding_type: str) -> list:
    """
    Returns the tokens in a text string using the specified encoding.
    """
    encoding = encoding_getter(encoding_type)
    tokens = encoding.encode(string)
    return tokens


def token_counter(string: str, encoding_type: str) -> int:
    """
    Returns the number of tokens in a text string using the specified encoding.
    """
    num_tokens = len(tokenizer(string, encoding_type))
    return num_tokens
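
# Illustrative usage (a sketch; the exact count depends on the encoding that
# encoding_getter resolves to):
#   >>> token_counter("tiktoken is great!", "gpt-3.5-turbo")
#   6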


# Function to extract words from a given text
def extract_words(text, putInLower=False):
    # Use regex to find all words (sequences of alphanumeric characters)
    words = re.findall(r'\b\w+\b', text)
    if putInLower:
        return [word.lower() for word in words]
    return words

# Function to check if all words from 'compound_word' are in the 'word_list'
def all_words_in_list(compound_word, word_list, putInLower=False):
    # extract_words already lowercases the words when putInLower is True
    words_to_check = extract_words(compound_word, putInLower=putInLower)
    return all(word in word_list for word in words_to_check)
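
# Illustrative usage (a sketch): with putInLower=True the caller is expected
# to supply an already-lowercased word_list.
#   >>> all_words_in_list("H5N1 Virus", ["h5n1", "virus", "avian"], putInLower=True)
#   True
#   >>> all_words_in_list("H5N1 Virus", ["h5n1"], putInLower=True)
#   False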


def row_to_dict_string(rrrow, columnsDict):
    formatted_items = []
    for col in rrrow.index:
        if col not in columnsDict:
            continue
        value = rrrow[col]
        # Check if the value is an instance of a number (int, float, etc.)
        if isinstance(value, (int, float)):
            formatted_items.append(f'"{col}": {value}')  # Use double quotes for keys
        else:
            formatted_items.append(
                f'"{col}": "{value}"')  # Use double quotes for keys and string values
    # Join items and enclose them in {}
    return '{' + ', '.join(formatted_items) + '}'
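
# Illustrative usage (a sketch with a hypothetical pandas row; note the output
# is only JSON-like: string values containing double quotes are not escaped).
#   >>> import pandas as pd
#   >>> row = pd.Series({'disease': 'H5N1', 'cases': 12})
#   >>> row_to_dict_string(row, {'disease': None, 'cases': None})
#   '{"disease": "H5N1", "cases": 12}'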



def rescale_exponential_to_linear(df, column, new_min=0.5, new_max=1.0):
    # Get the original exponential scores
    original_scores = df[column]

    # Normalize the scores to a 0-1 range
    min_score = original_scores.min()
    max_score = original_scores.max()
    normalized_scores = (original_scores - min_score) / (max_score - min_score)

    # Rescale the normalized scores to the interval [new_min, new_max]
    linear_scores = new_min + (normalized_scores * (new_max - new_min))

    # Assign the linear scores back to the dataframe
    df[column] = linear_scores

    return df
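
# Illustrative usage (a sketch with made-up scores): the minimum maps to
# new_min, the maximum to new_max, and everything in between scales linearly.
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'score': [1.0, 10.0, 100.0]})
#   >>> rescale_exponential_to_linear(df, 'score')['score'].round(3).tolist()
#   [0.5, 0.545, 1.0]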


def rescale_exponential_to_logarithmic(df, column, new_min=0.5, new_max=1.0):
    # Shift values by a tiny epsilon so zero scores don't hit log(0), which is undefined
    epsilon = 1e-10
    df[column] = df[column] + epsilon

    # Apply logarithmic transformation
    log_transformed_scores = np.log(df[column])

    # Normalize the log-transformed scores to a 0-1 range
    min_score = log_transformed_scores.min()
    max_score = log_transformed_scores.max()
    normalized_log_scores = (log_transformed_scores - min_score) / (max_score - min_score)

    # Rescale the normalized scores to the interval [new_min, new_max]
    logarithmic_scores = new_min + (normalized_log_scores * (new_max - new_min))

    # Assign the logarithmically scaled scores back to the dataframe
    df[column] = logarithmic_scores

    return df
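
# Illustrative usage (a sketch with made-up scores): after the log transform,
# a geometric series like 1, 10, 100 becomes evenly spaced on [new_min, new_max].
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'score': [1.0, 10.0, 100.0]})
#   >>> rescale_exponential_to_logarithmic(df, 'score')['score'].round(3).tolist()
#   [0.5, 0.75, 1.0]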