# File size: 13,423 Bytes
# 5ecde30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# (line-number gutter 1-325 left over from the web file viewer removed; it is not part of the program)
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from time import time
import numpy as np
from src.A_Preprocess import clean_text
from src.E_Summarization import simple_summarize_text #, summarize_text
from src.E_Model_utils import get_transformes_embeddings, load_model, get_embeddings
from src.E_Faiss_utils import load_faiss_index, normalize_embeddings

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


st.header('Watson Assistant VDF TOBi improvement')
st.write('The model is trained on the TOBi 🤖 intents in Romanian language.')
'---'

# Sidebar selector. The label chosen here is also used further down to build
# the FAISS index file name (embeddings/<model_name>_vector_db.index).
model_name = st.sidebar.radio("Selectează modelul 👇", ["MiniLM-L12-v2","llama3.2-1b","all-MiniLM-L6-v2","bert-base-romanian-cased-v1","multilingual-e5-small","e5_small_fine_tuned_model","all-distilroberta-v1"])
# Other checkpoints that were tried:
#model_name = "xlm-roberta-base"
#model_name = "xlm-r-distilroberta-base-paraphrase-v1"
#'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Resolve the sidebar label to the checkpoint id / local path to load.
# This must be ONE if/elif chain: with independent `if` statements the first
# three choices also fell through to the final `else` and the page showed
# "Choose a model" even though a valid model had been picked.
if model_name:
    if model_name == "bert-base-romanian-cased-v1":
        transformer_model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
    elif model_name == "llama3.2-1b":
        infloat_model_name = "AlexHung29629/sgpt-llama3.2-1b-stage1"
    elif model_name == "MiniLM-L12-v2":
        infloat_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        # The short label is replaced by the full name, which is what the
        # index file name is built from later.
        model_name = "paraphrase-multilingual-MiniLM-L12-v2"
    elif model_name == "multilingual-e5-small":
        infloat_model_name = "intfloat/multilingual-e5-small"
    elif model_name == "e5_small_fine_tuned_model":
        infloat_model_name = r"output\fine-tuned-model"
        local_only = "local_files_only = True"
    elif model_name == "all-MiniLM-L6-v2":
        infloat_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    elif model_name == "all-distilroberta-v1":
        infloat_model_name = "sentence-transformers/all-distilroberta-v1"
    else:
        # Only reachable for a label that has no mapping above.
        st.write("Choose a model")

    st.write(f"Model **{model_name}** loaded successfully!")

# Seed every session-state slot the app reads. Streamlit re-executes the whole
# script on each interaction, so a slot is only initialised when it is missing
# and values written by earlier runs are kept.
_SESSION_DEFAULTS = {
    'index_loaded': False,
    'index': None,
    'pdf_button_enabled': False,
    'data': None,
    'intent_button_clicked': False,
    'intent': None,
    'similarity': None,
    'model': None,
    'summar_model': None,
    'summarized_text': None,
    'csv_copied': False,
    'csv_file_path': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned.csv',
    'copied_csv_file_path': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned_Copy.csv',
    'user_text': "",
    'user_utterance_updated': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\User_utterances_updated.csv',
}
for _slot, _initial in _SESSION_DEFAULTS.items():
    if _slot not in st.session_state:
        st.session_state[_slot] = _initial

# Function to create a copy of the CSV file
def create_csv_copy():
    """Write a copy of the intents CSV and flag it as available.

    Reads the file at ``st.session_state.csv_file_path`` with pandas, writes
    it (without the index column) to ``st.session_state.copied_csv_file_path``,
    sets ``st.session_state.csv_copied`` to True and shows a success message.
    """
    source_path = st.session_state.csv_file_path
    target_path = st.session_state.copied_csv_file_path
    pd.read_csv(source_path).to_csv(target_path, index=False)
    st.session_state.csv_copied = True
    st.success("CSV file copied successfully.")

# Function to add user text and intent to the copied CSV file
def add_user_text_and_intent():
    """Append the current utterance, intent and similarity to the copied CSV.

    Does nothing unless ``st.session_state.csv_copied`` is set. Otherwise the
    copied CSV is read, one row built from ``user_text``, ``intent`` and
    ``similarity`` in session state is echoed to the page and appended, and
    the file is written back in place.
    """
    if not st.session_state.csv_copied:
        return

    existing_rows = pd.read_csv(st.session_state.copied_csv_file_path)
    appended_row = {
        'utterance': st.session_state.user_text,
        'intent': st.session_state.intent,
        'similarity': st.session_state.similarity,
    }
    st.write(appended_row)
    combined = pd.concat([existing_rows, pd.DataFrame([appended_row])], ignore_index=True)

    combined.to_csv(str(st.session_state.copied_csv_file_path), index=False)
    st.success("User text and intent added to the copied CSV file successfully.")

# Load button: instantiate the selected model, read its FAISS index from
# embeddings/<model_name>_vector_db.index and keep both in session state.
if st.button("Load Embeddings and Index"):
    if model_name == "bert-base-romanian-cased-v1":
        # Plain transformers checkpoint (not a SentenceTransformer).
        tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
        model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
    elif model_name == "e5_small_fine_tuned_model":
        # Locally fine-tuned checkpoint, loaded from disk only.
        model = SentenceTransformer(r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\src\output\fine-tuned-model\e5_small_fine_tuned_model', local_files_only = True)

        # Show a couple of model facts for the fine-tuned checkpoint.
        vocab_size = model.tokenizer.vocab_size
        st.write(f"**Vocab Size:** {vocab_size}")
        max_len = model.max_seq_length
        st.write(f"**Max Sequence Length:** {max_len}")
    else:
        # Every other choice uses the id resolved by the model-selection step.
        model = SentenceTransformer(infloat_model_name)

    st.session_state.model = model

    index = load_faiss_index(f"embeddings/{model_name}_vector_db.index")
    st.session_state.index = index
    st.session_state.index_loaded = True
    st.write("Embeddings and index loaded successfully!")



# Intents CSV uploader: shown only once the model and index have been loaded.
if st.session_state.index_loaded:
    '-------------------'
    st.write('✨ Load the csv file?')
    uploaded_file = st.file_uploader("Search the csv file", type="csv")

    if uploaded_file is None:
        # Nothing in the widget on this run: fall back to the table kept in
        # session state, if there is one, and preview its first five rows.
        remembered = st.session_state.data
        if remembered is not None:
            st.write("Previously uploaded data:")
            st.write(remembered[:5])
    else:
        st.session_state.data = pd.read_csv(uploaded_file)
        st.write("CSV file successfully uploaded!")
        st.write(st.session_state.data)  # Show the full uploaded table


# If data is loaded, allow user to input text and identify intent
data = st.session_state.data
...
if st.session_state.data is not None:
    '-------------------'

    user_text = st.text_area("👇 Enter user utterance text:", placeholder= 'User text') 
    st.write(f'Text length: {len(user_text)}')
    # Step 5: Process the text if it's provided
    if user_text:
        if len(user_text) > 150:
            st.write("The text is too long. Please summarize it.")
            if st.button("Summarize"):
                st.session_state.summarized_text = simple_summarize_text(user_text)
                # Remember which input the summary belongs to.
                st.session_state.summarized_source = user_text

            # st.button is True only on the rerun triggered by its own click.
            # Without this check the summary was dropped on the next rerun
            # (e.g. when pressing "Calculate Intent and Similarity") and the
            # full, too-long text was embedded instead.
            if (st.session_state.summarized_text
                    and st.session_state.get('summarized_source') == user_text):
                user_text = st.session_state.summarized_text
                st.write(f"The summarized text: {user_text}")

        # Store the (possibly summarized) user text in session state
        st.session_state.user_text = user_text

        start = time()

        # Clean the user input text
        cleaned_text = clean_text(user_text)

        # Get embeddings for the cleaned text using the loaded model
        model = st.session_state.model

        if model_name == "bert-base-romanian-cased-v1":
            bert_checkpoint = "dumitrescustefan/bert-base-romanian-cased-v1"
            # Keep the tokenizer across reruns instead of rebuilding it on
            # every widget interaction.
            if 'bert_tokenizer' not in st.session_state:
                st.session_state.bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
            tokenizer = st.session_state.bert_tokenizer
            # The Load button stores the AutoModel in session state when BERT
            # is selected, so reuse it. Only reload if the sidebar choice was
            # changed after loading and a SentenceTransformer is stored.
            if model is None or isinstance(model, SentenceTransformer):
                model = AutoModel.from_pretrained(bert_checkpoint)
            input_embedding = get_transformes_embeddings([cleaned_text], model, tokenizer)
        else:
            input_embedding = get_embeddings(model, [cleaned_text])

        # Normalize the embedding
        normalized_embedding = normalize_embeddings(input_embedding)

        # Store the embedding and cleaned text for the intent lookup step
        st.session_state.input_embedding = normalized_embedding

        st.session_state.cleaned_text = cleaned_text

        # Display the "Identify Intent" button
        intent_button = st.button("Calculate Intent and Similarity")

        # Latch the click in session state so the result section stays
        # visible on later reruns
        if intent_button:
            st.session_state.intent_button_clicked = True

# Step 6: If the intent button is clicked, find the closest intent using FAISS
if st.session_state.intent_button_clicked and st.session_state.get('input_embedding') is not None:
    start = time()
    # Perform a search using FAISS to find the closest embedding match
    index = st.session_state.index
    D, I = index.search(st.session_state.input_embedding, 1)  # Searching for the closest neighbor

    intents = st.session_state.data['intent'].tolist()
    nearest_row = int(I[0][0])

    # FAISS reports -1 when it has no neighbour to return; as a Python list
    # index that would silently pick the LAST intent. A label beyond the CSV
    # length means the uploaded CSV does not match the loaded index.
    if not 0 <= nearest_row < len(intents):
        st.error("Could not map the search result to an intent. "
                 "Check that the uploaded CSV matches the loaded index.")
    else:
        intent = intents[nearest_row]  # Fetch the most similar intent
        distance = D[0][0]
        similarity = 1 / (1 + distance)  # Calculate similarity from distance

        # Store intent and similarity in session state to persist results
        st.session_state.intent = intent
        st.session_state.similarity = similarity

        # Display the results
        st.write(f"Intent: {intent}")
        st.write(f"Confidence: {similarity:.4f}")
        st.write(f"Timp de răspuns: {time() - start:.4f} secunde")

        # Button to confirm adding user text and intent to the copied CSV file
        '-------------------'
        st.write(f'✨ Correct Intent: **{intent}**?')
        if st.button("Append User Text and Intent"):
            # Create the working copy only once per session. Re-creating it
            # on every click overwrote the copy with the original file and
            # threw away every row appended earlier.
            if not st.session_state.csv_copied:
                create_csv_copy()
            add_user_text_and_intent()

        '-------------------'

# Session-state slots used by the batch (CSV) intent section; each starts as
# None and is left untouched when already present.
for _utt_slot in ('utt_csv_file', 'utt_intent_results_df', 'utt_csv_file_df'):
    if _utt_slot not in st.session_state:
        st.session_state[_utt_slot] = None

# Function to perform similarity/intent search on a CSV file
def apply_similarity_search(df):
    """Fill the 'intent' column of *df* with the nearest intent per utterance.

    Every value of ``df['utterance']`` is encoded with the model held in
    ``st.session_state.model``, the FAISS index in ``st.session_state.index``
    is queried for the single nearest neighbour of each embedding, and the
    returned row number is looked up in the 'intent' column of
    ``st.session_state.data``.

    Args:
        df: DataFrame with an 'utterance' column. It is modified in place.

    Returns:
        The same DataFrame, with 'intent' set for every row. It is also
        written to ``Updated_<uploaded file name>`` in the working directory.

    Raises:
        KeyError: if *df* has no 'utterance' column.
    """
    # Check if 'utterance' column exists
    if 'utterance' not in df.columns:
        raise KeyError("The column 'utterance' does not exist in the DataFrame.")

    # Generate embeddings for each utterance
    # NOTE(review): the interactive path runs clean_text() and
    # normalize_embeddings() before searching; this batch path does neither.
    # Confirm whether that difference is intended.
    # NOTE(review): for "bert-base-romanian-cased-v1" the stored model comes
    # from AutoModel.from_pretrained; confirm it exposes .encode().
    utterances = df['utterance'].tolist()
    embeddings = st.session_state.model.encode(utterances)
    embeddings = np.array(embeddings).astype('float32')

    if len(embeddings):
        intents = st.session_state.data['intent'].tolist()
        # One batched query instead of one FAISS call per row.
        _, neighbour_rows = st.session_state.index.search(embeddings, 1)
        # Assign by position so a non-default DataFrame index cannot misplace
        # rows. FAISS uses -1 for "no neighbour"; store None for it rather
        # than letting the negative index pick the last intent.
        df['intent'] = [
            None if row < 0 else intents[row]
            for row in neighbour_rows[:, 0].tolist()
        ]

    # Save the updated DataFrame next to the app, named after the upload
    csv_file_name = st.session_state.utt_csv_file.name
    df.to_csv(f'Updated_{csv_file_name}', index=False)

    return df

# Batch section, step 1: upload the utterances CSV. The section only appears
# once a single-utterance similarity has been computed.
if st.session_state.similarity:
    if st.session_state.utt_csv_file is None:
        st.header('✨ Auto-update the utterances list without intent')
        csv_file = st.file_uploader("Load User utterances file", type="csv")
        if csv_file is not None:
            st.session_state.utt_csv_file = csv_file
            # Parse the upload and keep the table for later reruns
            df = pd.read_csv(csv_file, encoding='windows-1252')
            st.session_state.utt_csv_file_df = df
            # Show only the utterance and intent columns
            st.write(df[['utterance','intent']])
            st.success("Utterance file loaded successfully.")
    elif st.session_state.utt_csv_file_df is not None:
        st.write("Utterance file already loaded.")
        df = st.session_state.utt_csv_file_df
        # Show only the utterance and intent columns
        st.write(df[['utterance','intent']])

# Second button: Apply Similarity Search to CSV
if st.session_state.utt_csv_file is not None and st.button("Apply Similarity Search to CSV"):
    st.write("Performing similarity search on the uploaded CSV file...")
    df = st.session_state.utt_csv_file_df
    # Time the batch search itself and keep the figure in session state. The
    # page previously printed `time() - start`, where `start` belongs to the
    # single-utterance section and is reset on every rerun, so the number
    # never described this search.
    search_started = time()
    results_df = apply_similarity_search(df)
    st.session_state.utt_search_seconds = time() - search_started
    st.session_state.utt_intent_results_df = results_df
    #st.write("Results:")
    #st.dataframe(results_df.head())

# Display the results if available
if st.session_state.utt_intent_results_df is not None:
    st.write("Results:")

    df = st.session_state.utt_intent_results_df
    # Show only the utterance and intent columns
    display_results_df = df[['utterance','intent']]
    st.write(display_results_df)
    search_seconds = st.session_state.get('utt_search_seconds')
    if search_seconds is not None:
        st.write(f"Timp de răspuns: {search_seconds:.4f} secunde")

# Optional: Display previous results if the process was already done
#if st.session_state.intent is not None:
#    st.write(f"Intenția identificată anterior: {st.session_state.intent}")
#    st.write(f"Nivel de încredere anterior: {st.session_state.similarity:.4f}")

# Stop the Streamlit app
st.stop()