import re
import numpy as np
import tiktoken
from langchain.text_splitter import TokenTextSplitter
# Function to strip a single pair of surrounding double quotes, if present
def strip_quotes(text):
    if text.startswith('"') and text.endswith('"'):
        return text[1:-1]
    return text
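# Illustrative sketch (not part of the original file): minimal sanity checks
# for strip_quotes. Only a matched pair of double quotes is removed.
def _demo_strip_quotes():
    assert strip_quotes('"hello"') == 'hello'
    assert strip_quotes('hello') == 'hello'
    assert strip_quotes('"hello') == '"hello'  # unmatched quote is kept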
# Parse a yes/no style string into a bool (cf. the old distutils.util.strtobool)
def strtobool(val):
    val = val.lower()
    if val in ('yes', 'true', 't', '1'):
        return True
    elif val in ('no', 'false', 'f', '0'):
        return False
    else:
        raise ValueError(f"Invalid truth value {val}")
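# Illustrative sketch (not part of the original file): strtobool on a few
# inputs; anything outside the recognised tokens raises ValueError.
def _demo_strtobool():
    assert strtobool("Yes") is True
    assert strtobool("0") is False
    try:
        strtobool("maybe")
    except ValueError:
        pass  # expected: unrecognised token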
def split_camel_case(word):
    # Match the (empty) position between a lowercase letter and an uppercase letter
    pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')
    # Insert a space at each matched position
    split_word = pattern.sub(' ', word)
    return split_word
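# Illustrative sketch (not part of the original file): split_camel_case only
# splits at lowercase-to-uppercase boundaries, so all-caps runs stay intact.
def _demo_split_camel_case():
    assert split_camel_case("camelCaseWord") == "camel Case Word"
    assert split_camel_case("HTTPServer") == "HTTPServer"  # no lower->upper boundary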
# Function to split a token list into consecutive chunks of at most max_len tokens
def chunk_tokens(tokens, max_len):
    for i in range(0, len(tokens), max_len):
        yield tokens[i:i + max_len]
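# Illustrative sketch (not part of the original file): chunk_tokens is a
# generator, so wrap it in list() to materialise the chunks.
def _demo_chunk_tokens():
    assert list(chunk_tokens(list(range(7)), 3)) == [[0, 1, 2], [3, 4, 5], [6]]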
# Recursively merge nested dict u into dict d (u's leaves win on conflicts)
def update_nested_dict(d, u):
    for k, v in u.items():
        if isinstance(v, dict):
            d[k] = update_nested_dict(d.get(k, {}), v)
        else:
            d[k] = v
    return d
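# Illustrative sketch (not part of the original file): nested keys are merged
# rather than overwritten wholesale.
def _demo_update_nested_dict():
    base = {"a": {"x": 1}, "b": 2}
    update_nested_dict(base, {"a": {"y": 3}})
    assert base == {"a": {"x": 1, "y": 3}, "b": 2}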
def cleanInputText(textInputLLM):
    # Remove artifacts left over from stringified tuples, e.g. a leading ('\n\n
    # or ("\n\n and a trailing \n\n',) or \n\n",) -- literal backslash-n text
    textInputLLM = re.sub(r'\(\'\\n\\n', ' ', textInputLLM)
    textInputLLM = re.sub(r'\(\"\\n\\n', ' ', textInputLLM)
    textInputLLM = re.sub(r'\\n\\n\',\)', ' ', textInputLLM)
    textInputLLM = re.sub(r'\\n\\n\",\)', ' ', textInputLLM)
    # Use while loops where a single pass is not enough: collapsing '--' into
    # '-' (or '..' into '.') can itself create new occurrences of the pattern
    while re.search(r'##\n', textInputLLM):
        textInputLLM = re.sub(r"##\n", '. ', textInputLLM)
    while '###' in textInputLLM:
        textInputLLM = textInputLLM.replace("###", ' ')
    while '##' in textInputLLM:
        textInputLLM = textInputLLM.replace("##", ' ')
    while ' # ' in textInputLLM:
        textInputLLM = textInputLLM.replace(" # ", ' ')
    while '--' in textInputLLM:
        textInputLLM = textInputLLM.replace("--", '-')
    while re.search(r'\\\\-', textInputLLM):
        textInputLLM = re.sub(r"\\\\-", '.', textInputLLM)
    while re.search(r'\*\*\n', textInputLLM):
        textInputLLM = re.sub(r"\*\*\n", '. ', textInputLLM)
    while re.search(r'\*\*\*', textInputLLM):
        textInputLLM = re.sub(r"\*\*\*", ' ', textInputLLM)
    while re.search(r'\*\*', textInputLLM):
        textInputLLM = re.sub(r"\*\*", ' ', textInputLLM)
    while re.search(r' \* ', textInputLLM):
        textInputLLM = re.sub(r" \* ", ' ', textInputLLM)
    # Re-join the ProMED-mail boilerplate sentence when it was hard-wrapped
    while re.search(r'is a program of the\n\nInternational Society for Infectious Diseases', textInputLLM):
        textInputLLM = re.sub(
            r'is a program of the\n\nInternational Society for Infectious Diseases',
            'is a program of the International Society for Infectious Diseases',
            textInputLLM,
            flags=re.M
        )
    # Optional (kept disabled): collapse all newlines into sentence breaks/spaces
    # while re.search(r'\n\n', textInputLLM):
    #     textInputLLM = re.sub(r'\n\n', '. ', textInputLLM)
    # while re.search(r'\n', textInputLLM):
    #     textInputLLM = re.sub(r'\n', ' ', textInputLLM)
    while re.search(r' \*\.', textInputLLM):
        textInputLLM = re.sub(r' \*\.', ' .', textInputLLM)
    while '  ' in textInputLLM:
        textInputLLM = textInputLLM.replace("  ", ' ')
    while re.search(r'\.\.', textInputLLM):
        textInputLLM = re.sub(r'\.\.', '.', textInputLLM)
    while re.search(r'\. \.', textInputLLM):
        textInputLLM = re.sub(r'\. \.', '.', textInputLLM)
    # Final cleanup of leftover tuple/quote artifacts
    textInputLLM = re.sub(r'\(\"\.', ' ', textInputLLM)
    textInputLLM = re.sub(r'\(\'\.', ' ', textInputLLM)
    textInputLLM = re.sub(r'\",\)', ' ', textInputLLM)
    textInputLLM = re.sub(r'\',\)', ' ', textInputLLM)
    # Strip leading/trailing whitespace
    textInputLLM = textInputLLM.strip()
    return textInputLLM
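# Illustrative sketch (not part of the original file): cleaning a str()-converted
# tuple. The \\n in the raw string below are literal backslash-n characters, not
# newlines -- exactly the artifact the first group of substitutions targets.
def _demo_cleanInputText():
    raw = "('\\n\\nHello -- world..\\n\\n',)"
    assert cleanInputText(raw) == "Hello - world."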
def encoding_getter(encoding_type: str):
    """
    Return the tiktoken encoding for a given encoding name or OpenAI model name.
    tiktoken supports three encodings used by OpenAI models:
        Encoding name        OpenAI models
        cl100k_base          gpt-4, gpt-3.5-turbo, text-embedding-ada-002
        p50k_base            Codex models, text-davinci-002, text-davinci-003
        r50k_base (or gpt2)  GPT-3 models like davinci
    https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
    """
    if "k_base" in encoding_type:
        return tiktoken.get_encoding(encoding_type)
    try:
        return tiktoken.encoding_for_model(encoding_type)
    except Exception:
        # Unknown model name: fall back to the gpt-4 / gpt-3.5-turbo default
        return tiktoken.get_encoding("cl100k_base")
def tokenizer(string: str, encoding_type: str) -> list:
    """
    Returns the tokens in a text string using the specified encoding.
    """
    encoding = encoding_getter(encoding_type)
    tokens = encoding.encode(string)
    return tokens
def token_counter(string: str, encoding_type: str) -> int:
    """
    Returns the number of tokens in a text string using the specified encoding.
    """
    num_tokens = len(tokenizer(string, encoding_type))
    return num_tokens
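# Illustrative sketch (not part of the original file): counting tokens with the
# cl100k_base encoding. Assumes tiktoken can load its encoding files (downloaded
# and cached on first use); the expected count follows the example in the
# OpenAI cookbook notebook linked above.
def _demo_token_counter():
    assert token_counter("tiktoken is great!", "cl100k_base") == 6
    # A model name works too; unknown names fall back to cl100k_base
    assert token_counter("tiktoken is great!", "gpt-3.5-turbo") == 6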
# Function to extract all words (sequences of alphanumeric characters) from a text
def extract_words(text, putInLower=False):
    if putInLower:
        return [word.lower() for word in re.findall(r'\b\w+\b', text)]
    else:
        return re.findall(r'\b\w+\b', text)
# Function to check whether every word in 'compound_word' appears in 'word_list'
def all_words_in_list(compound_word, word_list, putInLower=False):
    words_to_check = extract_words(compound_word, putInLower=putInLower)
    if putInLower:
        return all(word.lower() in word_list for word in words_to_check)
    else:
        return all(word in word_list for word in words_to_check)
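# Illustrative sketch (not part of the original file): with putInLower=True the
# membership test is case-insensitive on the compound word's side (word_list
# itself is assumed to already be lowercase).
def _demo_all_words_in_list():
    assert extract_words("Hello, world!") == ["Hello", "world"]
    assert all_words_in_list("New York", ["new", "york", "city"], putInLower=True)
    assert not all_words_in_list("New Jersey", ["new", "york"], putInLower=True)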
# Serialise a pandas row as a JSON-like dict string, keeping only the columns
# present in columnsDict; numbers are left unquoted, everything else is quoted
def row_to_dict_string(rrrow, columnsDict):
    formatted_items = []
    for col in rrrow.index:
        if col not in columnsDict:
            continue
        value = rrrow[col]
        # Numbers (int, float, etc.) are emitted without quotes
        if isinstance(value, (int, float)):
            formatted_items.append(f'"{col}": {value}')
        else:
            formatted_items.append(f'"{col}": "{value}"')
    # Join the items and enclose them in braces
    return '{' + ', '.join(formatted_items) + '}'
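# Illustrative sketch (not part of the original file): assumes pandas is
# installed. columnsDict only needs to contain the columns to keep as keys.
def _demo_row_to_dict_string():
    import pandas as pd
    row = pd.Series({"name": "Alice", "age": 30, "internal_id": 7})
    out = row_to_dict_string(row, {"name": None, "age": None})
    assert out == '{"name": "Alice", "age": 30}'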
def rescale_exponential_to_linear(df, column, new_min=0.5, new_max=1.0):
    # Get the original exponential scores
    original_scores = df[column]
    # Normalize the scores to a 0-1 range
    min_score = original_scores.min()
    max_score = original_scores.max()
    normalized_scores = (original_scores - min_score) / (max_score - min_score)
    # Rescale the normalized scores to the interval [new_min, new_max]
    linear_scores = new_min + (normalized_scores * (new_max - new_min))
    # Assign the linear scores back to the dataframe (modified in place)
    df[column] = linear_scores
    return df
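# Illustrative sketch (not part of the original file): assumes pandas is
# installed. Note the dataframe is modified in place as well as returned.
def _demo_rescale_linear():
    import pandas as pd
    df = pd.DataFrame({"score": [1.0, 10.0, 100.0]})
    rescale_exponential_to_linear(df, "score")
    # min maps to new_min (0.5), max to new_max (1.0), the rest linearly
    assert df["score"].iloc[0] == 0.5 and df["score"].iloc[2] == 1.0
    assert 0.5 < df["score"].iloc[1] < 0.75  # 10 sits far below the midpoint of 1..100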
def rescale_exponential_to_logarithmic(df, column, new_min=0.5, new_max=1.0):
    # Shift values by a small epsilon so that log(0) is never taken
    epsilon = 1e-10
    df[column] = df[column] + epsilon
    # Apply the logarithmic transformation
    log_transformed_scores = np.log(df[column])
    # Normalize the log-transformed scores to a 0-1 range
    min_score = log_transformed_scores.min()
    max_score = log_transformed_scores.max()
    normalized_log_scores = (log_transformed_scores - min_score) / (max_score - min_score)
    # Rescale the normalized scores to the interval [new_min, new_max]
    logarithmic_scores = new_min + (normalized_log_scores * (new_max - new_min))
    # Assign the logarithmically scaled scores back to the dataframe (in place)
    df[column] = logarithmic_scores
    return df
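# Illustrative sketch (not part of the original file): assumes pandas is
# installed. On exponentially spaced data the log transform yields evenly
# spaced scores.
def _demo_rescale_logarithmic():
    import pandas as pd
    df = pd.DataFrame({"score": [1.0, 10.0, 100.0]})
    rescale_exponential_to_logarithmic(df, "score")
    # log-spacing of 1, 10, 100 is uniform, so scores land at 0.5, 0.75, 1.0
    assert [round(s, 6) for s in df["score"]] == [0.5, 0.75, 1.0]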