Spaces:
Sleeping
Sleeping
File size: 13,423 Bytes
5ecde30 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from time import time
import numpy as np
from src.A_Preprocess import clean_text
from src.E_Summarization import simple_summarize_text #, summarize_text
from src.E_Model_utils import get_transformes_embeddings, load_model, get_embeddings
from src.E_Faiss_utils import load_faiss_index, normalize_embeddings
import warnings
# Silence FutureWarning messages for the rest of the script run.
warnings.filterwarnings("ignore", category=FutureWarning)

# ---------------------------------------------------------------------------
# Page header
# ---------------------------------------------------------------------------
st.header('Watson Assistant VDF TOBi improvement')
st.write('The model is trained on the TOBi 🤖 intents in Romanian language.')
'---'

# ---------------------------------------------------------------------------
# Model selection (sidebar)
# ---------------------------------------------------------------------------
model_name = st.sidebar.radio(
    "Selectează modelul 👇",
    [
        "MiniLM-L12-v2",
        "llama3.2-1b",
        "all-MiniLM-L6-v2",
        "bert-base-romanian-cased-v1",
        "multilingual-e5-small",
        "e5_small_fine_tuned_model",
        "all-distilroberta-v1",
    ],
)

# Sidebar label -> identifier handed to SentenceTransformer(...) further down.
# "bert-base-romanian-cased-v1" is absent on purpose: it is loaded through
# AutoTokenizer/AutoModel and gets its own variable instead.
_SENTENCE_TRANSFORMER_IDS = {
    "llama3.2-1b": "AlexHung29629/sgpt-llama3.2-1b-stage1",
    "MiniLM-L12-v2": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
    "e5_small_fine_tuned_model": r"output\fine-tuned-model",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-distilroberta-v1": "sentence-transformers/all-distilroberta-v1",
}

if model_name:
    if model_name == "bert-base-romanian-cased-v1":
        transformer_model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
    elif model_name in _SENTENCE_TRANSFORMER_IDS:
        infloat_model_name = _SENTENCE_TRANSFORMER_IDS[model_name]

    if model_name == "e5_small_fine_tuned_model":
        local_only = "local_files_only = True"

    if model_name == "MiniLM-L12-v2":
        # The short sidebar label is swapped for the full name; model_name is
        # later interpolated into the FAISS index file name.
        model_name = "paraphrase-multilingual-MiniLM-L12-v2"
else:
    st.write("Choose a model")

st.write(f"Model **{model_name}** loaded successfully!")
# Seed st.session_state with a default for every key the script reads, so a
# key that has not been written yet is always present with a known value.
# Existing values are never overwritten: Streamlit re-executes the script on
# each interaction and these entries carry state across those reruns.
#
# NOTE(review): the three CSV paths are absolute paths on one developer's
# Windows machine; they will not resolve anywhere else.
_SESSION_DEFAULTS = {
    'index_loaded': False,
    'index': None,
    'pdf_button_enabled': False,
    'data': None,
    'intent_button_clicked': False,
    'intent': None,
    'similarity': None,
    'model': None,
    'summar_model': None,
    'summarized_text': None,
    'csv_copied': False,
    'csv_file_path': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned.csv',
    'copied_csv_file_path': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned_Copy.csv',
    'user_text': "",
    'user_utterance_updated': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\User_utterances_updated.csv',
    # The intent step tests `st.session_state.input_embedding is not None`,
    # which raises AttributeError unless the key exists; seed it (and its
    # companion cleaned_text) with None like every other entry.
    'input_embedding': None,
    'cleaned_text': None,
}

for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# Function to create a copy of the CSV file
def create_csv_copy():
    """Create the working copy of the intents CSV, once per session.

    Reads ``st.session_state.csv_file_path`` and writes it to
    ``st.session_state.copied_csv_file_path``, then sets
    ``st.session_state.csv_copied`` and shows a success message.

    The "Append User Text and Intent" button calls this function before
    every append. Re-copying unconditionally would overwrite the working
    copy with the pristine original each time and discard the rows appended
    earlier, so the copy is skipped when it has already been made in this
    session and the file is still on disk.
    """
    import os

    source_path = st.session_state.csv_file_path
    copy_path = st.session_state.copied_csv_file_path

    # Already copied in this session and still present: keep appended rows.
    if st.session_state.csv_copied and os.path.exists(copy_path):
        return

    df = pd.read_csv(source_path)
    df.to_csv(copy_path, index=False)
    st.session_state.csv_copied = True
    st.success("CSV file copied successfully.")
# Function to add user text and intent to the copied CSV file
def add_user_text_and_intent():
    """Append the current utterance, intent and similarity to the CSV copy.

    Does nothing unless ``st.session_state.csv_copied`` is set. Otherwise the
    copied CSV is read, one row built from ``user_text``, ``intent`` and
    ``similarity`` in session state is echoed to the page and appended, and
    the file is written back to the same path.
    """
    state = st.session_state
    if not state.csv_copied:
        return

    copy_path = state.copied_csv_file_path
    existing_rows = pd.read_csv(copy_path)

    record = {
        'utterance': state.user_text,
        'intent': state.intent,
        'similarity': state.similarity,
    }
    st.write(record)

    extended = pd.concat(
        [existing_rows, pd.DataFrame([record])],
        ignore_index=True,
    )
    extended.to_csv(f'{copy_path}', index=False)
    st.success("User text and intent added to the copied CSV file successfully.")
# First button: load the selected model and its FAISS index into session state
if st.button("Load Embeddings and Index"):
    if model_name == "bert-base-romanian-cased-v1":
        # Plain transformers checkpoint: tokenizer and model are loaded separately.
        tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
        model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
    elif model_name == "e5_small_fine_tuned_model":
        # Fine-tuned checkpoint read from a local directory only.
        model = SentenceTransformer(r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\src\output\fine-tuned-model\e5_small_fine_tuned_model', local_files_only = True)
        # Report two basic properties of the loaded checkpoint.
        st.write(f"**Vocab Size:** {model.tokenizer.vocab_size}")
        st.write(f"**Max Sequence Length:** {model.max_seq_length}")
    else:
        model = SentenceTransformer(infloat_model_name)

    # Keep the model across Streamlit reruns.
    st.session_state.model = model

    # The index file name is derived from the (possibly renamed) model_name.
    st.session_state.index = load_faiss_index(f"embeddings/{model_name}_vector_db.index")
    st.session_state.index_loaded = True
    st.write("Embeddings and index loaded successfully!")
# File uploader: rendered only once the index has been loaded into session state
if st.session_state.index_loaded:
    '-------------------'
    st.write('✨ Load the csv file?')
    uploaded_file = st.file_uploader("Search the csv file", type="csv")

    if uploaded_file is None:
        # Nothing in the uploader on this rerun: fall back to the data kept
        # in session state, if any, and preview it.
        if st.session_state.data is not None:
            st.write("Previously uploaded data:")
            st.write(st.session_state.data[:5])  # first 5 rows only
    else:
        st.session_state.data = pd.read_csv(uploaded_file)
        st.write("CSV file successfully uploaded!")
        st.write(st.session_state.data)  # show the full uploaded table
# If data is loaded, allow user to input text and identify intent
data = st.session_state.data
...
if st.session_state.data is not None:
    # Ask for the user utterance
    '-------------------'
    user_text = st.text_area("👇 Enter user utterance text:", placeholder= 'User text')
    st.write(f'Text length: {len(user_text)}')

    # Step 5: Process the text if it's provided
    if user_text:
        if len(user_text) > 150:
            st.write("The text is too long. Please summarize it.")
            if st.button("Summarize"):
                st.session_state.summarized_text = simple_summarize_text(user_text)
                user_text = st.session_state.summarized_text
                st.write(f"The summarized text: {user_text}")

        # Store the user text in session state
        st.session_state.user_text = user_text
        start = time()

        # Clean the user input text
        cleaned_text = clean_text(user_text)

        # Get embeddings for the cleaned text using the loaded model
        model = st.session_state.model
        if model_name == "bert-base-romanian-cased-v1":
            # Reuse the model stored by the load button instead of
            # re-instantiating it on every rerun. It is only reloaded when
            # the stored object is missing or is a SentenceTransformer left
            # over from a different sidebar selection.
            if model is None or isinstance(model, SentenceTransformer):
                model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
                st.session_state.model = model
            # The tokenizer is cached in session state after its first load.
            if 'bert_tokenizer' not in st.session_state:
                st.session_state.bert_tokenizer = AutoTokenizer.from_pretrained(
                    "dumitrescustefan/bert-base-romanian-cased-v1"
                )
            tokenizer = st.session_state.bert_tokenizer
            input_embedding = get_transformes_embeddings([cleaned_text], model, tokenizer)
        else:
            input_embedding = get_embeddings(model, [cleaned_text])

        # Normalize the embedding
        normalized_embedding = normalize_embeddings(input_embedding)

        # Store the embedding in session state
        st.session_state.input_embedding = normalized_embedding
        st.session_state.cleaned_text = cleaned_text

        # Remember the click: st.button is True only on the rerun it triggers.
        if st.button("Calculate Intent and Similarity"):
            st.session_state.intent_button_clicked = True

        # Step 6: If the intent button is clicked, find the closest intent using FAISS
        if st.session_state.intent_button_clicked and st.session_state.input_embedding is not None:
            start = time()

            # Nearest-neighbour lookup (k = 1) in the loaded index
            index = st.session_state.index
            D, I = index.search(st.session_state.input_embedding, 1)

            intents = st.session_state.data['intent'].tolist()
            match_position = int(I[0][0])

            if not 0 <= match_position < len(intents):
                # FAISS reports -1 when it has no neighbour to return, and a
                # position past the end means the uploaded CSV has fewer rows
                # than the index. Either way there is no row to read.
                st.error(
                    f"No matching intent: the index returned position {match_position}, "
                    f"but the uploaded CSV has {len(intents)} rows."
                )
            else:
                intent = intents[match_position]
                distance = D[0][0]
                # NOTE(review): this treats D as a distance (smaller = closer).
                # If the index is an inner-product index, D is already a
                # similarity score — confirm against how the index was built.
                similarity = 1 / (1 + distance)

                # Store intent and similarity in session state to persist results
                st.session_state.intent = intent
                st.session_state.similarity = similarity

                # Display the results
                st.write(f"Intent: {intent}")
                st.write(f"Confidence: {similarity:.4f}")
                st.write(f"Timp de răspuns: {time() - start:.4f} secunde")

                # Button to confirm adding user text and intent to the copied CSV file
                '-------------------'
                st.write(f'✨ Correct Intent: **{intent}**?')
                if st.button("Append User Text and Intent"):
                    create_csv_copy()
                    add_user_text_and_intent()
'-------------------'
# Session-state slots used by the bulk-labelling section below: the uploaded
# file object, the DataFrame parsed from it, and the labelled results.
# Each one starts as None and is left alone if it already exists.
for _bulk_key in ('utt_csv_file', 'utt_intent_results_df', 'utt_csv_file_df'):
    if _bulk_key not in st.session_state:
        st.session_state[_bulk_key] = None
# Function to perform similarity/intent search on a CSV file
def apply_similarity_search(df):
    """Fill the ``intent`` column of *df* from the nearest indexed neighbour.

    Every value in ``df['utterance']`` is encoded with the model held in
    ``st.session_state.model`` and looked up (k = 1) in
    ``st.session_state.index``. The returned position selects an entry from
    the ``intent`` column of ``st.session_state.data``. *df* is modified in
    place, written to ``Updated_<uploaded file name>`` and returned.

    Args:
        df: DataFrame with an ``utterance`` column.

    Returns:
        The same DataFrame, with ``intent`` set for every row.

    Raises:
        KeyError: if *df* has no ``utterance`` column.

    NOTE(review): assumes the session model exposes ``.encode`` (as
    SentenceTransformer does); the AutoModel stored for the BERT option
    would need a different embedding call.
    NOTE(review): the single-utterance flow runs ``clean_text`` and
    ``normalize_embeddings`` before searching; this path does neither —
    confirm whether that difference is intended.
    """
    if 'utterance' not in df.columns:
        raise KeyError("The column 'utterance' does not exist in the DataFrame.")

    # Blank CSV cells arrive as NaN (a float), which the encoder cannot
    # tokenize; turn every cell into a string first.
    utterances = df['utterance'].fillna('').astype(str).tolist()
    intents = st.session_state.data['intent'].tolist()

    if utterances:
        embeddings = st.session_state.model.encode(utterances)
        embeddings = np.array(embeddings).astype('float32')

        # One batched search instead of one call per row; the result has one
        # row of neighbour positions per query vector.
        _, neighbours = st.session_state.index.search(embeddings, 1)

        # -1 is FAISS's "no neighbour" marker; plain indexing would silently
        # wrap it to the last intent, so map it to None instead.
        predicted = [
            intents[position] if position >= 0 else None
            for position in neighbours[:, 0]
        ]

        # Assign the whole column at once: this follows the row order of df
        # whatever its index labels are, unlike df.at[i, ...] with an
        # enumerate counter, which only lines up with a default 0..n-1 index.
        df['intent'] = predicted

    # Save the updated DataFrame next to the app, named after the upload
    csv_file_name = st.session_state.utt_csv_file.name
    df.to_csv(f'Updated_{csv_file_name}', index=False)
    return df
# Bulk labelling: upload an utterance CSV, label it, show the results
def _show_utterance_columns(frame):
    """Display the 'utterance' and 'intent' columns of *frame* that exist."""
    # The section targets files without intents, so 'intent' may be absent
    # before the search has run; selecting it blindly would raise KeyError.
    visible = [column for column in ('utterance', 'intent') if column in frame.columns]
    st.write(frame[visible])


# First step: load the utterance file (offered once a similarity has been computed)
if st.session_state.similarity and st.session_state.utt_csv_file is None:
    st.header('✨ Auto-update the utterances list without intent')
    csv_file = st.file_uploader("Load User utterances file", type="csv")
    if csv_file is not None:
        # Parse first, then store the file and its DataFrame together, so a
        # failed parse cannot leave the file recorded without its data.
        df = pd.read_csv(csv_file, encoding='windows-1252')
        st.session_state.utt_csv_file = csv_file
        st.session_state.utt_csv_file_df = df
        _show_utterance_columns(df)
        st.success("Utterance file loaded successfully.")
elif st.session_state.similarity and st.session_state.utt_csv_file_df is not None:
    st.write("Utterance file already loaded.")
    _show_utterance_columns(st.session_state.utt_csv_file_df)

# Second step: apply the similarity search to the loaded CSV
if st.session_state.utt_csv_file is not None and st.button("Apply Similarity Search to CSV"):
    st.write("Performing similarity search on the uploaded CSV file...")
    search_started = time()
    results_df = apply_similarity_search(st.session_state.utt_csv_file_df)
    st.session_state.utt_intent_results_df = results_df
    # Time this search itself and keep the figure for later reruns, rather
    # than relying on a `start` variable set by the single-utterance flow
    # (which may not exist on this rerun).
    st.session_state.utt_search_seconds = time() - search_started

# Display the results if available
if st.session_state.utt_intent_results_df is not None:
    st.write("Results:")
    _show_utterance_columns(st.session_state.utt_intent_results_df)
    if 'utt_search_seconds' in st.session_state:
        st.write(f"Timp de răspuns: {st.session_state.utt_search_seconds:.4f} secunde")

# Stop the Streamlit app
st.stop()
|