rkrstacic committed on
Commit caa4374 · 1 Parent(s): 3ab21b8

Delete gradioapptest.py

Files changed (1)
  1. gradioapptest.py +0 -431
gradioapptest.py DELETED
@@ -1,431 +0,0 @@
- # -*- coding: utf-8 -*-
- """GradioAppTest.ipynb
-
- Automatically generated by Colaboratory.
-
- Original file is located at
-     https://colab.research.google.com/drive/1QhxoNhhM_kcaoQOyz5hsNWLcf2m2L225
- """
-
- !pip install gradio
- !pip install transformers
-
- import gradio as gr
- from transformers import pipeline
-
- """## JSON"""
-
- # Define the process that the models will be trained for
- trainedProcess = "praksa"
- trainedProcessJSON = "Praksa"
-
- json = [
-     {
-         "name": "Praksa",
-         "phases": [
-             {
-                 "name": "Odabir preferencija",
-                 "alias": ["Prijava prakse", "Odabir zadatka", "Prvi korak"],
-                 "description": "Odabir preferencija je prvi korak u procesu polaganja prakse. Zahtjeva da student odabere zadatak sa popisa...",
-                 "duration": "1 mjesec",
-             },
-             {
-                 "name": "Ispunjavanje prijavnice",
-                 "description": "Ispunjavanje prijavnice je drugi korak u procesu polaganja prakse. Student mora ispuniti prijavnicu koja se nalazi na stranici kolegija...",
-                 "duration": "1 tjedan",
-             },
-             {
-                 "name": "Predaja dnevnika prakse",
-                 "alias": ["Završetak prakse", "Dnevnik"],
-                 "description": "Predaja dnevnika prakse zadnji je korak u procesu polaganja prakse. S završetkom rada, student predaje dnevnik prakse na stranicu kolegija...",
-                 "duration": "3 dana",
-             },
-         ],
-         "duration": "2 mjeseca",
-     },
-     {
-         "name": "Izrada završnog rada",
-         "phases": [
-             {
-                 "name": "Prijava teme",
-                 "alias": ["Prvi korak"],
-                 "description": "Prvi korak u procesu izrade završnog rada je prijava teme. Zahtjeva da student odabere mentora te prijavi temu sa popisa...",
-                 "duration": "5 dana",
-             },
-             {
-                 "name": "Ispuna obrasca",
-                 "description": "Student ispunjava obrazac sa prijavljenom temom...",
-                 "duration": "4 dana",
-             },
-             {
-                 "name": "Obrana rada",
-                 "description": "Student brani svoj rad pred komisijom...",
-                 "duration": "1 sat",
-             },
-         ],
-         "duration": "3 mjeseca",
-     },
- ]
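-
- # Note on the structure above: each process has a name, a duration and a list of phases;
- # every phase has a name, a description and a duration, and may optionally list alias names.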
-
- # If tasks do not contain the alias property, assign an empty one to them
- for process in json:
-     for task in process["phases"]:
-         if "alias" not in task:
-             task["alias"] = []
-
- """## User intent recognition model
-
- Approximate runtime: ~6 min on CPU, ~3 min on GPU
- """
-
- # Define the number of training epochs and intent labels
- training_epochs = 10
- label_size = 6
-
- # Define dataset URL for training
- UIDatasetURL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSPR-FPTMBcYRynP4JdwYQQ8dAhSx1x8i1LPckUcuIUUlrWT82b5Thqb1bBNnPeGJPxxX1CJAlFSd6F/pub?output=xlsx'
-
- # May require a runtime restart on Google Colab
- !pip install tensorflow_text
-
- !pip install text-hr
-
- """### Data loading"""
-
- import tensorflow as tf
- import tensorflow_text as tft  # registers the ops required by the TF Hub preprocessor
- import tensorflow_hub as tfh
- import pandas as pd
- import numpy as np
- import seaborn as sns
- import matplotlib.pyplot as plt
-
- # Text preprocessor for BERT-based models
- preprocessor = tfh.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2')
-
- # Language-agnostic BERT sentence encoder (LaBSE)
- model = tfh.KerasLayer('https://tfhub.dev/google/LaBSE/2')
-
- # Read the data
- data = pd.read_excel(UIDatasetURL)
-
- columns = ['text', 'intent', 'process']
- data.columns = columns
-
- data = data[data["process"] == trainedProcess].drop(columns="process")
-
- """#### Category merging"""
-
- # Convert categories to codes
- data['intent'] = data['intent'].astype('category')
- data['intent_codes'] = data['intent'].cat.codes
-
- # Display the distribution of codes
- values = data['intent'].value_counts()
- plt.stem(values)
-
- """#### Normalize data
-
- ### Text preprocessing
-
- 1. Remove punctuation
- 2. Lowercase the text
- 3. Apply tokenization
- 4. Apply lemmatization
- 5. Remove (Croatian) stopwords
- """
-
- import string
- import re
- import nltk
- import text_hr
-
- nltk.download('stopwords')
- nltk.download('wordnet')
- nltk.download('omw-1.4')
- from nltk.stem.porter import PorterStemmer
- from nltk.stem import WordNetLemmatizer
-
- def remove_punctuation(text):
-     return "".join([i for i in text if i not in string.punctuation])
-
- def tokenization(text):
-     return re.split(r"\s+", text)
-
- stopwords = nltk.corpus.stopwords.words('english')
- def remove_stopwords(text):
-     return [i for i in text if i not in stopwords]
-
- porter_stemmer = PorterStemmer()
- def stemming(text):
-     return [porter_stemmer.stem(word) for word in text]
-
- wordnet_lemmatizer = WordNetLemmatizer()
- def lemmatizer(text):
-     return [wordnet_lemmatizer.lemmatize(word) for word in text]
-
- data['text'] = data['text']\
-     .apply(lambda x: remove_punctuation(x))\
-     .apply(lambda x: x.lower())\
-     .apply(lambda x: tokenization(x))\
-     .apply(lambda x: lemmatizer(x))
-
- stop_words_list_hr = []
- for word_base, l_key, cnt, _suff_id, wform_key, wform in text_hr.get_all_std_words():
-     if word_base is not None: stop_words_list_hr.append(word_base)
-     if wform is not None: stop_words_list_hr.append(wform)
-
- stop_words_list_hr = list(dict.fromkeys(stop_words_list_hr))
-
- def remove_stopwords_hr(text):
-     output = [i for i in text if i not in stop_words_list_hr]
-     return output
-
- data['text'] = data['text'].apply(lambda x: remove_stopwords_hr(x))
-
- data['text'] = data['text'].str.join(" ")
-
- """### Split validation and training data
-
- Train 75%, validation 25% (the split is currently disabled below with `frac=0`,
- so validation reuses the training data)
- """
-
- codes = data['intent_codes'].unique()
-
- # Variable to understand the meaning behind codes
- CODES_REPR = data[["intent_codes", "intent"]].drop_duplicates().sort_values("intent_codes")
-
-
- def codeToIntent(prediction) -> str:
-     """ Returns the intent of the prediction, not the code """
-     return CODES_REPR[CODES_REPR["intent_codes"] == prediction.argmax()].iloc[0]["intent"]
-
- preprocessed_validation_data = pd.DataFrame(columns=data.columns)
- preprocessed_train_data = pd.DataFrame(columns=data.columns)
-
- for c in codes:
-     sample = data[data['intent_codes'] == c]
-     sample = sample.sample(frac=1)
-     # val = sample.sample(frac=0.25)
-     val = sample.sample(frac=0)
-     train = pd.concat([sample, val]).drop_duplicates(keep=False)
-     # DataFrame.append was removed in pandas 2.x; pd.concat does the same job
-     preprocessed_validation_data = pd.concat([preprocessed_validation_data, val], ignore_index=True)
-     preprocessed_train_data = pd.concat([preprocessed_train_data, train], ignore_index=True)
-
- # Keep only the text and intent code columns
- train_data_eng = preprocessed_train_data[['text', 'intent_codes']]
- train_data_eng.columns = ['text', 'intent_codes']
-
- validation_data_eng = preprocessed_validation_data[['text', 'intent_codes']]
- validation_data_eng.columns = ['text', 'intent_codes']
-
- def df_to_dataset(df, shuffle=True, batch_size=16):
-     df = df.copy()
-     labels = df.pop('intent_codes')
-     labels_cat = tf.keras.utils.to_categorical(labels, label_size)
-     dataset = tf.data.Dataset.from_tensor_slices((dict(df), labels_cat))
-     if shuffle:
-         dataset = dataset.shuffle(buffer_size=len(df))
-     dataset = dataset.batch(batch_size).prefetch(batch_size)
-     return dataset
-
- _validation = train_data_eng
- train_data_eng = df_to_dataset(train_data_eng)
-
- # validation_data_eng = df_to_dataset(validation_data_eng)
- validation_data_eng = df_to_dataset(_validation)  # validation currently reuses the training data
-
- """### Model definition and training
-
- Trains for `training_epochs` epochs (currently 10)
- """
-
- # Model builder
- def model_build():
-     inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
-     encoded_input = preprocessor(inputs)
-     encoder_outputs = model(encoded_input)
-
-     x = encoder_outputs['pooled_output']
-     x = tf.keras.layers.Dropout(0.1)(x)
-     x = tf.keras.layers.Dense(128, activation='relu')(x)
-     x = tf.keras.layers.Dropout(0.7)(x)
-     outputs = tf.keras.layers.Dense(label_size, activation='softmax', name='classifier')(x)
-
-     return tf.keras.Model(inputs, outputs)
-
- # Build a model with preprocessed data
- model_eng = model_build()
- model_eng.compile(
-     optimizer = tf.keras.optimizers.Adam(0.001),
-     # The classifier ends in a softmax, so the loss receives probabilities, not logits
-     loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False),
-     metrics = [tf.keras.metrics.CategoricalAccuracy()]
- )
-
- eng_history = model_eng.fit(
-     train_data_eng,
-     epochs = training_epochs,
-     # Batching is already handled in df_to_dataset, so batch_size is not passed here
-     validation_data = validation_data_eng,
- )
-
- """## Data extraction pipeline"""
-
- !pip install transformers
-
- from transformers import pipeline
-
- pipe = pipeline("token-classification", model="rkrstacic/bpmn-task-extractor")
-
- """## Sentence similarity"""
-
- !pip install -U sentence-transformers
-
- import numpy as np
- from typing import List, Dict
-
- # Extract the task name mentioned in the text via the NER pipeline
- def predictNER(text: str) -> Dict:
-     currentString = "".join([x["word"] for x in pipe(text) if x["entity"] != "LABEL_0"])
-
-     # Return dictionary without empty values
-     return { "Task": currentString.replace("▁", " ")[1:] }
-
- from sentence_transformers import SentenceTransformer, util
-
- # Note: this rebinds the name `model` (previously the LaBSE layer); the trained model_eng is unaffected
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
- import torch
-
- def getTaskSimilarityIndex(flatIndex: int, tasks) -> int:
-     """ Get the task index based on the flattened task list """
-     for index, task in enumerate(tasks):
-         if flatIndex <= len(task["alias"]):
-             return index
-
-         flatIndex -= len(task["alias"]) + 1
-
-     return -1
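-
- # Worked example (derived from the JSON above): the flattened "Praksa" list is
- # ["Odabir preferencija", "Prijava prakse", "Odabir zadatka", "Prvi korak",
- #  "Ispunjavanje prijavnice", "Predaja dnevnika prakse", "Završetak prakse", "Dnevnik"],
- # so flatIndex values 0-3 map back to task 0, 4 to task 1, and 5-7 to task 2.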
-
- def getFlattenTasks(tasks) -> List[str]:
-     """ Returns the flattened list of task names and their aliases """
-     resTasks = []
-
-     for task in tasks:
-         resTasks.append(task["name"])
-         resTasks = resTasks + task["alias"]
-
-     return resTasks
-
- def taskSimilarity(text: str, tasks) -> int:
-     """ Returns the index of the task most similar to the text """
-     return getTaskSimilarityIndex(torch.argmax(util.pytorch_cos_sim(
-         model.encode(text, convert_to_tensor=True),
-         model.encode(getFlattenTasks(tasks), convert_to_tensor=True)
-     )).item(), tasks)
-
- """## Using the user intent model"""
-
- def preprocessText(text: str) -> str:
-     """ Apply the same preprocessing as the UI model's training data """
-     text = remove_punctuation(text)
-     text = text.lower()
-     text = tokenization(text)
-     text = lemmatizer(text)
-     text = remove_stopwords_hr(text)
-
-     return " ".join(text)
-
- def predict_intent(text: str) -> str:
-     """ Predict the text's intent with the model trained above """
-     return codeToIntent(model_eng.predict([preprocessText(text)], verbose=False))
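-
- # Example: predict_intent("Koliko traje cijeli proces") returns one of the six intent
- # labels (e.g. 'P2'), depending on what the trained model predicts for the question.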
-
- def getPhases(phases) -> str:
-     """ P1: Returns the formatted phases """
-     phases = [phase["name"].lower() for phase in phases]
-     return ', '.join(phases[:-1]) + ' i ' + phases[-1]
-
- # Define functions that handle output text formatting
-
- def getP1String(process) -> str:
-     return f"Faze procesa za proces '{process['name']}' su: {getPhases(process['phases'])}"
-
- def getP2String(process) -> str:
-     return f"Proces '{process['name']}' traje {process['duration']}"
-
- def getP3String(taskName: str, task) -> str:
-     return f"Kratki opis '{taskName}': {task['description']}"
-
- def getP4String(taskName: str, task) -> str:
-     return f"Proces '{taskName}' traje {task['duration']}"
-
- def getP5String(taskIndex: int, taskName: str, process) -> str:
-     if len(process["phases"]) <= taskIndex + 1:
-         return f"'{taskName}' je zadnji korak u procesu '{process['name']}'"
-
-     return f"Nakon '{taskName}' je '{process['phases'][taskIndex + 1]['name'].lower()}'"
-
- def getP6String() -> str:
-     return "Nažalost, ne razumijem Vaše pitanje"
-
- def print_result(text: str, process) -> str:
-     """ Chatbot output messages based on intent """
-     intent = predict_intent(text)
-     taskIndex = taskSimilarity(text, process["phases"])
-     task = process["phases"][taskIndex]
-     taskName = task["name"].lower()
-
-     # P1: Koje su faze (What are the phases?)
-     if intent == 'P1':
-         return getP1String(process)
-
-     # P2: Koliko traje cijeli proces (How long does the whole process take?)
-     elif intent == 'P2':
-         return getP2String(process)
-
-     # P3: Kako ide odabir preferencija? (How does preference selection work?)
-     elif intent == 'P3':
-         return getP3String(taskName, task)
-
-     # P4: Koliko traje {task} (How long does {task} take?)
-     elif intent == 'P4':
-         return getP4String(taskName, task)
-
-     # P5: Što je nakon {task} (What comes after {task}?)
-     elif intent == 'P5':
-         return getP5String(taskIndex, taskName, process)
-
-     # Ništa od navedenog (None of the above)
-     else:
-         return getP6String()
-
- def chatbot(input_text) -> str:
-     """ By: Rafael Krstačić """
-     processName = trainedProcessJSON
-     currentProcess = None
-
-     for process in json:
-         if process["name"] == processName:
-             currentProcess = process
-             break
-     else:
-         raise KeyError("Process does not exist in json")
-
-     return print_result(input_text, currentProcess)
-
- """## Gradio app"""
-
- chatbot("Koliko traje predaja dnevnika prakse")
-
- iface = gr.Interface(
-     fn=chatbot,
-     inputs="text",
-     outputs=["text"],
-     title="Sentiment Analysis"
- )
-
- iface.launch()