rkrstacic committed on
Commit caa4374 · 1 Parent(s): 3ab21b8

Delete gradioapptest.py

Files changed (1)
  1. gradioapptest.py +0 -431
gradioapptest.py DELETED
@@ -1,431 +0,0 @@
- # -*- coding: utf-8 -*-
- """GradioAppTest.ipynb
-
- Automatically generated by Colaboratory.
-
- Original file is located at
-     https://colab.research.google.com/drive/1QhxoNhhM_kcaoQOyz5hsNWLcf2m2L225
- """
-
- !pip install gradio
- !pip install transformers
-
- import gradio as gr
- from transformers import pipeline
-
- """## JSON"""
-
- # Define the process that the models will be trained for
- trainedProcess = "praksa"
- trainedProcessJSON = "Praksa"
-
- json = [
-     {
-         "name": "Praksa",
-         "phases": [
-             {
-                 "name": "Odabir preferencija",
-                 "alias": ["Prijava prakse", "Odabir zadatka", "Prvi korak"],
-                 "description": "Odabir preferencija je prvi korak u procesu polaganja prakse. Zahtjeva da student odabere zadatak sa popisa...",
-                 "duration": "1 mjesec",
-             },
-             {
-                 "name": "Ispunjavanje prijavnice",
-                 "description": "Ispunjavanje prijavnice je drugi korak u procesu polaganja prakse. Student mora ispuniti prijavnicu koja se nalazi na stranici kolegija...",
-                 "duration": "1 tjedan",
-             },
-             {
-                 "name": "Predaja dnevnika prakse",
-                 "alias": ["Završetak prakse", "Dnevnik"],
-                 "description": "Predaja dnevnika prakse zadnji je korak u procesu polaganja prakse. S završetkom rada, student predaje dnevnik prakse na stranicu kolegija...",
-                 "duration": "3 dana",
-             },
-         ],
-         "duration": "2 mjeseca",
-     },
-     {
-         "name": "Izrada završnog rada",
-         "phases": [
-             {
-                 "name": "Prijava teme",
-                 "alias": ["Prvi korak"],
-                 "description": "Prvi korak u procesu izrade završnog rada je prijava teme. Zahtjeva da student odabere mentora te prijavi temu sa popisa...",
-                 "duration": "5 dana",
-             },
-             {
-                 "name": "Ispuna obrasca",
-                 "description": "Student ispunjava obrazac sa prijavljenom temom...",
-                 "duration": "4 dana",
-             },
-             {
-                 "name": "Obrana rada",
-                 "description": "Student brani svoj rad pred komisijom...",
-                 "duration": "1 sat",
-             },
-         ],
-         "duration": "3 mjeseca",
-     },
- ]
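-
- # Note on the structure above: each process has a name, a duration and a list of phases;
- # every phase has a name, a description and a duration, and may optionally list alias names.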
-
- # If tasks do not contain the alias property, assign an empty one to them
- for process in json:
-     for task in process["phases"]:
-         if "alias" not in task:
-             task["alias"] = []
-
- """## User intent recognition model
-
- Approximate runtime: ~6 min on CPU, ~3 min on GPU
- """
-
- # Define the number of training epochs and intent labels
- training_epochs = 10
- label_size = 6
-
- # Define dataset URL for training
- UIDatasetURL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSPR-FPTMBcYRynP4JdwYQQ8dAhSx1x8i1LPckUcuIUUlrWT82b5Thqb1bBNnPeGJPxxX1CJAlFSd6F/pub?output=xlsx'
-
- # May require a runtime restart on Google Colab
- !pip install tensorflow_text
-
- !pip install text-hr
-
- """### Data loading"""
-
- import tensorflow as tf
- import tensorflow_text as tft  # registers the ops required by the TF Hub preprocessor
- import tensorflow_hub as tfh
- import pandas as pd
- import numpy as np
- import seaborn as sns
- import matplotlib.pyplot as plt
-
- # Text preprocessor for BERT-based models
- preprocessor = tfh.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2')
-
- # Language-agnostic BERT sentence encoder (LaBSE)
- model = tfh.KerasLayer('https://tfhub.dev/google/LaBSE/2')
-
- # Read the data
- data = pd.read_excel(UIDatasetURL)
-
- columns = ['text', 'intent', 'process']
- data.columns = columns
-
- data = data[data["process"] == trainedProcess].drop(columns="process")
-
- """#### Category merging"""
-
- # Convert categories to codes
- data['intent'] = data['intent'].astype('category')
- data['intent_codes'] = data['intent'].cat.codes
-
- # Display the distribution of codes
- values = data['intent'].value_counts()
- plt.stem(values)
-
- """#### Normalize data
-
- ### Text preprocessing
-
- 1. Remove punctuation
- 2. Lowercase the text
- 3. Apply tokenization
- 4. Apply lemmatization
- 5. Remove (Croatian) stopwords
- """
-
- import string
- import re
- import nltk
- import text_hr
-
- nltk.download('stopwords')
- nltk.download('wordnet')
- nltk.download('omw-1.4')
- from nltk.stem.porter import PorterStemmer
- from nltk.stem import WordNetLemmatizer
-
- def remove_punctuation(text):
-     return "".join([i for i in text if i not in string.punctuation])
-
- def tokenization(text):
-     return re.split(r"\s+", text)
-
- stopwords = nltk.corpus.stopwords.words('english')
- def remove_stopwords(text):
-     return [i for i in text if i not in stopwords]
-
- porter_stemmer = PorterStemmer()
- def stemming(text):
-     return [porter_stemmer.stem(word) for word in text]
-
- wordnet_lemmatizer = WordNetLemmatizer()
- def lemmatizer(text):
-     return [wordnet_lemmatizer.lemmatize(word) for word in text]
-
- data['text'] = data['text']\
-     .apply(lambda x: remove_punctuation(x))\
-     .apply(lambda x: x.lower())\
-     .apply(lambda x: tokenization(x))\
-     .apply(lambda x: lemmatizer(x))
-
- stop_words_list_hr = []
- for word_base, l_key, cnt, _suff_id, wform_key, wform in text_hr.get_all_std_words():
-     if word_base is not None: stop_words_list_hr.append(word_base)
-     if wform is not None: stop_words_list_hr.append(wform)
-
- stop_words_list_hr = list(dict.fromkeys(stop_words_list_hr))
-
- def remove_stopwords_hr(text):
-     output = [i for i in text if i not in stop_words_list_hr]
-     return output
-
- data['text'] = data['text'].apply(lambda x: remove_stopwords_hr(x))
-
- data['text'] = data['text'].str.join(" ")
-
- """### Split validation and training data
-
- Train 75%, validation 25% (the split is currently disabled below with `frac=0`,
- so validation reuses the training data)
- """
-
- codes = data['intent_codes'].unique()
-
- # Variable to understand the meaning behind codes
- CODES_REPR = data[["intent_codes", "intent"]].drop_duplicates().sort_values("intent_codes")
-
-
- def codeToIntent(prediction) -> str:
-     """ Returns the intent of the prediction, not the code """
-     return CODES_REPR[CODES_REPR["intent_codes"] == prediction.argmax()].iloc[0]["intent"]
-
- preprocessed_validation_data = pd.DataFrame(columns=data.columns)
- preprocessed_train_data = pd.DataFrame(columns=data.columns)
-
- for c in codes:
-     sample = data[data['intent_codes'] == c]
-     sample = sample.sample(frac=1)
-     # val = sample.sample(frac=0.25)
-     val = sample.sample(frac=0)
-     train = pd.concat([sample, val]).drop_duplicates(keep=False)
-     # DataFrame.append was removed in pandas 2.x; pd.concat does the same job
-     preprocessed_validation_data = pd.concat([preprocessed_validation_data, val], ignore_index=True)
-     preprocessed_train_data = pd.concat([preprocessed_train_data, train], ignore_index=True)
-
- # Keep only the text and intent code columns
- train_data_eng = preprocessed_train_data[['text', 'intent_codes']]
- train_data_eng.columns = ['text', 'intent_codes']
-
- validation_data_eng = preprocessed_validation_data[['text', 'intent_codes']]
- validation_data_eng.columns = ['text', 'intent_codes']
-
- def df_to_dataset(df, shuffle=True, batch_size=16):
-     df = df.copy()
-     labels = df.pop('intent_codes')
-     labels_cat = tf.keras.utils.to_categorical(labels, label_size)
-     dataset = tf.data.Dataset.from_tensor_slices((dict(df), labels_cat))
-     if shuffle:
-         dataset = dataset.shuffle(buffer_size=len(df))
-     dataset = dataset.batch(batch_size).prefetch(batch_size)
-     return dataset
-
- _validation = train_data_eng
- train_data_eng = df_to_dataset(train_data_eng)
-
- # validation_data_eng = df_to_dataset(validation_data_eng)
- validation_data_eng = df_to_dataset(_validation)  # validation currently reuses the training data
-
- """### Model definition and training
-
- Trains for `training_epochs` epochs (currently 10)
- """
-
- # Model builder
- def model_build():
-     inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
-     encoded_input = preprocessor(inputs)
-     encoder_outputs = model(encoded_input)
-
-     x = encoder_outputs['pooled_output']
-     x = tf.keras.layers.Dropout(0.1)(x)
-     x = tf.keras.layers.Dense(128, activation='relu')(x)
-     x = tf.keras.layers.Dropout(0.7)(x)
-     outputs = tf.keras.layers.Dense(label_size, activation='softmax', name='classifier')(x)
-
-     return tf.keras.Model(inputs, outputs)
-
- # Build a model with preprocessed data
- model_eng = model_build()
- model_eng.compile(
-     optimizer = tf.keras.optimizers.Adam(0.001),
-     # The classifier ends in a softmax, so the loss receives probabilities, not logits
-     loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False),
-     metrics = [tf.keras.metrics.CategoricalAccuracy()]
- )
-
- eng_history = model_eng.fit(
-     train_data_eng,
-     epochs = training_epochs,
-     # Batching is already handled in df_to_dataset, so batch_size is not passed here
-     validation_data = validation_data_eng,
- )
-
- """## Data extraction pipeline"""
-
- !pip install transformers
-
- from transformers import pipeline
-
- pipe = pipeline("token-classification", model="rkrstacic/bpmn-task-extractor")
-
- """## Sentence similarity"""
-
- !pip install -U sentence-transformers
-
- import numpy as np
- from typing import List, Dict
-
- # Extract the task name mentioned in the text via the NER pipeline
- def predictNER(text: str) -> Dict:
-     currentString = "".join([x["word"] for x in pipe(text) if x["entity"] != "LABEL_0"])
-
-     # Return dictionary without empty values
-     return { "Task": currentString.replace("▁", " ")[1:] }
-
- from sentence_transformers import SentenceTransformer, util
-
- # Note: this rebinds the name `model` (previously the LaBSE layer); the trained model_eng is unaffected
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
- import torch
-
- def getTaskSimilarityIndex(flatIndex: int, tasks) -> int:
-     """ Get the task index based on the flattened task list """
-     for index, task in enumerate(tasks):
-         if flatIndex <= len(task["alias"]):
-             return index
-
-         flatIndex -= len(task["alias"]) + 1
-
-     return -1
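-
- # Worked example (derived from the JSON above): the flattened "Praksa" list is
- # ["Odabir preferencija", "Prijava prakse", "Odabir zadatka", "Prvi korak",
- #  "Ispunjavanje prijavnice", "Predaja dnevnika prakse", "Završetak prakse", "Dnevnik"],
- # so flatIndex values 0-3 map back to task 0, 4 to task 1, and 5-7 to task 2.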
-
- def getFlattenTasks(tasks) -> List[str]:
-     """ Returns the flattened list of task names and their aliases """
-     resTasks = []
-
-     for task in tasks:
-         resTasks.append(task["name"])
-         resTasks = resTasks + task["alias"]
-
-     return resTasks
-
- def taskSimilarity(text: str, tasks) -> int:
-     """ Returns the index of the task most similar to the text """
-     return getTaskSimilarityIndex(torch.argmax(util.pytorch_cos_sim(
-         model.encode(text, convert_to_tensor=True),
-         model.encode(getFlattenTasks(tasks), convert_to_tensor=True)
-     )).item(), tasks)
-
- """## Using the user intent model"""
-
- def preprocessText(text: str) -> str:
-     """ Apply the same preprocessing as the UI model's training data """
-     text = remove_punctuation(text)
-     text = text.lower()
-     text = tokenization(text)
-     text = lemmatizer(text)
-     text = remove_stopwords_hr(text)
-
-     return " ".join(text)
-
- def predict_intent(text: str) -> str:
-     """ Predict the text's intent with the model trained above """
-     return codeToIntent(model_eng.predict([preprocessText(text)], verbose=False))
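-
- # Example: predict_intent("Koliko traje cijeli proces") returns one of the six intent
- # labels (e.g. 'P2'), depending on what the trained model predicts for the question.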
-
- def getPhases(phases) -> str:
-     """ P1: Returns the formatted phases """
-     phases = [phase["name"].lower() for phase in phases]
-     return ', '.join(phases[:-1]) + ' i ' + phases[-1]
-
- # Define functions that handle output text formatting
-
- def getP1String(process) -> str:
-     return f"Faze procesa za proces '{process['name']}' su: {getPhases(process['phases'])}"
-
- def getP2String(process) -> str:
-     return f"Proces '{process['name']}' traje {process['duration']}"
-
- def getP3String(taskName: str, task) -> str:
-     return f"Kratki opis '{taskName}': {task['description']}"
-
- def getP4String(taskName: str, task) -> str:
-     return f"Proces '{taskName}' traje {task['duration']}"
-
- def getP5String(taskIndex: int, taskName: str, process) -> str:
-     if len(process["phases"]) <= taskIndex + 1:
-         return f"'{taskName}' je zadnji korak u procesu '{process['name']}'"
-
-     return f"Nakon '{taskName}' je '{process['phases'][taskIndex + 1]['name'].lower()}'"
-
- def getP6String() -> str:
-     return "Nažalost, ne razumijem Vaše pitanje"
-
- def print_result(text: str, process) -> str:
-     """ Chatbot output messages based on intent """
-     intent = predict_intent(text)
-     taskIndex = taskSimilarity(text, process["phases"])
-     task = process["phases"][taskIndex]
-     taskName = task["name"].lower()
-
-     # P1: Koje su faze (What are the phases?)
-     if intent == 'P1':
-         return getP1String(process)
-
-     # P2: Koliko traje cijeli proces (How long does the whole process take?)
-     elif intent == 'P2':
-         return getP2String(process)
-
-     # P3: Kako ide odabir preferencija? (How does preference selection work?)
-     elif intent == 'P3':
-         return getP3String(taskName, task)
-
-     # P4: Koliko traje {task} (How long does {task} take?)
-     elif intent == 'P4':
-         return getP4String(taskName, task)
-
-     # P5: Što je nakon {task} (What comes after {task}?)
-     elif intent == 'P5':
-         return getP5String(taskIndex, taskName, process)
-
-     # Ništa od navedenog (None of the above)
-     else:
-         return getP6String()
-
- def chatbot(input_text) -> str:
-     """ By: Rafael Krstačić """
-     processName = trainedProcessJSON
-     currentProcess = None
-
-     for process in json:
-         if process["name"] == processName:
-             currentProcess = process
-             break
-     else:
-         raise KeyError("Process does not exist in json")
-
-     return print_result(input_text, currentProcess)
-
- """## Gradio app"""
-
- chatbot("Koliko traje predaja dnevnika prakse")
-
- iface = gr.Interface(
-     fn=chatbot,
-     inputs="text",
-     outputs=["text"],
-     title="Sentiment Analysis"
- )
-
- iface.launch()