Spaces:

nouf-sst
/

TGRL-bad-smells

Runtime error

App Files Files Community

nouf-sst committed on Apr 13, 2023

Commit

17cde60

1 Parent(s): d570802

Create app.py

Browse files

Files changed (1) hide show

app.py +619 -0

app.py ADDED Viewed

	@@ -0,0 +1,619 @@

+import gradio as gr
+import re
+import json
+import nltk
+import stanza
+from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
+from sentence_transformers import CrossEncoder
+from autocorrect import Speller
+from transformers import BertTokenizer, BertForSequenceClassification
+import torch
+from torch.nn.utils.rnn import pad_sequence
+import numpy as np
+from stanza.server import CoreNLPClient
+# ********************* Setting up Stanford CoreNLP *********************
+# Download the Stanford CoreNLP package with Stanza's installation command.
+# This runs at import time and can take several minutes, depending on the network speed.
+corenlp_dir = './corenlp'
+stanza.install_corenlp(dir=corenlp_dir)
+# Set the CORENLP_HOME environment variable to point to the installation location.
+# It is set before the client below is constructed.
+import os
+os.environ["CORENLP_HOME"] = corenlp_dir
+# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001.
+# This module-level `client` is the object get_clause_list() calls to obtain parse trees.
+client = CoreNLPClient(
+    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse'],
+    memory='4G',
+    endpoint='http://localhost:9001',
+    be_quiet=True)
+print(client)
+# Start the background server explicitly.
+# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
+client.start()
+# NOTE: an optional start-up delay (`time.sleep(10)`) was tried here and is left disabled.
+# ************************************************************************
+# ***************************** TGRL Parsing *****************************
+def parse_tgrl(file_obj):
+  with open(file_obj.name, 'r') as f:
+    tgrl_text = f.read()
+    tgrl_text = tgrl_text.replace('\t', '')
+    tgrl_text = tgrl_text.replace('\n', '')
+  return tgrl_text
+def extract_elements(tgrl_text):
+  # Extract actors
+  actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s-]*)(?:\")", tgrl_text)
+  # Extract goals
+  goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+  # Extract softGoals
+  softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+  # Extract tasks
+  tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+  # Extract resources
+  resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+  elements = {
+    "actors": actors,
+    "goals": goals,
+    "softGoals": softGoals,
+    "tasks": tasks,
+    "resources": resources
+  }
+  # get elements per actor
+  elements_per_actor = {}
+  for goal in goals:
+    corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(goal))
+    corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
+    if corresponding_actor not in elements_per_actor:
+        elements_per_actor[corresponding_actor] = []
+    elements_per_actor[corresponding_actor].append(goal)
+  for softGoal in softGoals:
+    corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(softGoal))
+    corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
+    if corresponding_actor not in elements_per_actor:
+        elements_per_actor[corresponding_actor] = []
+    elements_per_actor[corresponding_actor].append(softGoal)
+  for task in tasks:
+    corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(task))
+    corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
+    if corresponding_actor not in elements_per_actor:
+        elements_per_actor[corresponding_actor] = []
+    elements_per_actor[corresponding_actor].append(task)
+  # get decomposed elements
+  new_lines = tgrl_text
+  decomposed_elements = {}
+  main_elements = re.findall("\w+(?=\s+decomposedBy)", new_lines)
+  for main_element in main_elements:
+      sub_elements = []
+      sub_element = (re.findall(main_element+"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0])
+      sub_elements.append(sub_element)
+      new_lines = new_lines.replace(sub_element+', ', '')
+      temp = main_element + " decomposedBy "
+      for idx, sub_element in enumerate(sub_elements):
+          if idx+1 == len (sub_elements):
+              temp = temp + sub_element + ";"
+          else:
+              temp = temp + sub_element + ", "
+      while temp not in tgrl_text:
+          sub_element = (re.findall(main_element+"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0])
+          sub_elements.append(sub_element)
+          new_lines = new_lines.replace(sub_element+', ', '')
+          temp = main_element + " decomposedBy "
+          for idx, sub_element in enumerate(sub_elements):
+              if idx+1 == len (sub_elements):
+                  temp = temp + sub_element + ";"
+              else:
+                  temp = temp + sub_element + ", "
+      decomposed_elements[main_element] = sub_elements
+      # Replace elements IDs with names
+      new_decomposed_elements = {}
+      for key, _ in decomposed_elements.items():
+          new_key = re.findall("(?:"+key+" {\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
+          new_values = []
+          for element in decomposed_elements[key]:
+              new_value = re.findall("(?:"+element+" {\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
+              new_values.append(new_value)
+          new_decomposed_elements[new_key] = new_values
+  return elements, elements_per_actor, new_decomposed_elements
+# ************************************************************************
+# ************************* Bad Smells Detection *************************
+# ########### Long Elements ###########
+def get_long_elements(elements): # Using RegEx
+  long_elements = []
+  for key, value in elements.items():
+    for i in range(0, len(elements[key])):
+        if len(re. findall(r'\w+', elements[key][i])) > 4:
+            long_elements.append(elements[key][i])
+  if long_elements:
+    long_elements = "\n".join(long_elements)
+    return "Long elements:\n" + long_elements
+  else:
+    return "Long elements:\nNone."
+# #####################################
+# ######### Complex Sentences #########
+# Complex sentences
+def get_verb_phrases(t):
+    verb_phrases = []
+    num_children = len(t)
+    num_VP = sum(1 if t[i].label() == "VP" else 0 for i in range(0, num_children))
+    if t.label() != "VP":
+        for i in range(0, num_children):
+            if t[i].height() > 2:
+                verb_phrases.extend(get_verb_phrases(t[i]))
+    elif t.label() == "VP" and num_VP > 1:
+        for i in range(0, num_children):
+            if t[i].label() == "VP":
+                if t[i].height() > 2:
+                    verb_phrases.extend(get_verb_phrases(t[i]))
+    else:
+        verb_phrases.append(' '.join(t.leaves()))
+    return verb_phrases
+def get_pos(t):
+    vp_pos = []
+    sub_conj_pos = []
+    num_children = len(t)
+    children = [t[i].label() for i in range(0,num_children)]
+    flag = re.search(r"(S|SBAR|SBARQ|SINV|SQ)", ' '.join(children))
+    if "VP" in children and not flag:
+        for i in range(0, num_children):
+            if t[i].label() == "VP":
+                vp_pos.append(t[i].treeposition())
+    elif not "VP" in children and not flag:
+        for i in range(0, num_children):
+            if t[i].height() > 2:
+                temp1,temp2 = get_pos(t[i])
+                vp_pos.extend(temp1)
+                sub_conj_pos.extend(temp2)
+    # comment this "else" part, if want to include subordinating conjunctions
+    else:
+        for i in range(0, num_children):
+            if t[i].label() in ["S","SBAR","SBARQ","SINV","SQ"]:
+                temp1, temp2 = get_pos(t[i])
+                vp_pos.extend(temp1)
+                sub_conj_pos.extend(temp2)
+            else:
+                sub_conj_pos.append(t[i].treeposition())
+    return (vp_pos,sub_conj_pos)
+  # get all clauses
+def get_clause_list(sent):
+    """Split `sent` into simple clauses of the form "<subject> <verb phrase>".
+
+    The sentence is annotated by the module-level CoreNLP `client`; only the
+    parse of the first sentence in the annotation is used. Returns a list of
+    strings, one per verb phrase found in each extracted clause subtree.
+    """
+    parser = client.annotate(sent, properties={"annotators":"parse","outputFormat": "json"})
+    sent_tree = nltk.tree.ParentedTree.fromstring(parser["sentences"][0]["parse"])
+    #print(sent_tree)
+    clause_level_list = ["S","SBAR","SBARQ","SINV","SQ"]
+    clause_list = []
+    sub_trees = []
+    #sent_tree.pretty_print()
+    # break the tree into subtrees of clauses using
+    # clause levels "S","SBAR","SBARQ","SINV","SQ"
+    # The subtree list is materialized before the loop, so deleting from
+    # sent_tree inside the loop does not change what is iterated.
+    for sub_tree in reversed(list(sent_tree.subtrees())):
+        if sub_tree.label() in clause_level_list:
+            # skip a clause whose direct parent is itself a clause-level node
+            if sub_tree.parent().label() in clause_level_list:
+                continue
+            # skip an S node whose only child is a VP
+            if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
+                and not sub_tree.parent().label() in clause_level_list):
+                continue
+            sub_trees.append(sub_tree)
+            # remove the collected clause from the sentence tree
+            del sent_tree[sub_tree.treeposition()]
+    #print(sub_trees)
+    # for each clause level subtree, extract relevant simple sentence
+    for t in sub_trees:
+        # get verb phrases from the new modified tree
+        verb_phrases = get_verb_phrases(t)
+        #print(verb_phrases)
+        # get tree without verb phrases (mainly subject)
+        # remove subordinating conjunctions
+        # (all positions are computed by get_pos before anything is deleted)
+        vp_pos,sub_conj_pos = get_pos(t)
+        for i in vp_pos:
+            del t[i]
+        for i in sub_conj_pos:
+            del t[i]
+        # whatever leaves remain in the clause are used as its subject
+        subject_phrase = ' '.join(t.leaves())
+        # update the clause_list
+        for i in verb_phrases:
+            clause_list.append(subject_phrase + " " + i)
+    return clause_list
+def get_complex_sentences(elements):
+  complex_sentences = []
+  for key, value in elements.items():
+      for i in range(0, len(elements[key])):
+          if len(get_clause_list(re.sub(r"(\.|,|\?|\(|\)|\[|\])"," ", elements[key][i]))) > 1:
+              complex_sentences.append(elements[key][i])
+  if complex_sentences:
+    complex_sentences = "\n".join(complex_sentences)
+    return "Complex sentences:\n" + complex_sentences
+  else:
+    return "Complex sentences:\nNone."
+# #################################
+# ########## Punctuations #########
+def get_punctuations(elements):
+  punctuations = []
+  for key, value in elements.items():
+      for i in range(0, len(elements[key])):
+          if len(re.findall("[^\s\w\d-]", elements[key][i])) > 0:
+              punctuations.append(elements[key][i])
+  if punctuations:
+    punctuations = "\n".join(punctuations)
+    return "Punctuations:\n" + punctuations
+  else:
+    return "Punctuations:\nNone."
+# #################################
+# ########## Incorrect Actor Syntax ##########
+def find_non_NPs(sentences):
+  model_name = "QCRI/bert-base-multilingual-cased-pos-english"
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  model = AutoModelForTokenClassification.from_pretrained(model_name)
+  pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
+  outputs = pipeline(sentences)
+  Non_NPs = []
+  for idx, output in enumerate(outputs):
+    if not output[0]['entity'].startswith('N'):
+      Non_NPs.append(sentences[idx])
+  return Non_NPs
+def check_actor_syntax(actors):
+  incorrect_actor_syntax = find_non_NPs(actors)
+  if incorrect_actor_syntax:
+    incorrect_actor_syntax = "\n".join(incorrect_actor_syntax)
+    return "Incorrect Actors Syntax:\n" + incorrect_actor_syntax
+  else:
+    return "All actors are syntactically correct."
+# ############################################
+# ########## Incorrect Goal Syntax ###########
+def check_goal_syntax(goals):
+  incorrect_goal_syntax = find_non_NPs(goals)
+  if incorrect_goal_syntax:
+    incorrect_goal_syntax = "\n".join(incorrect_goal_syntax)
+    return "Incorrect Goals Syntax:\n" + incorrect_goal_syntax
+  else:
+    return "All goals are syntactically correct."
+# ############################################
+# ########## Incorrect Softgoal Syntax ###########
+def check_softgoal_syntax(softgoals):
+  incorrect_softgoal_syntax = find_non_NPs(softgoals)
+  if incorrect_softgoal_syntax:
+    incorrect_softgoal_syntax = "\n".join(incorrect_softgoal_syntax)
+    return "Incorrect Softgoals Syntax:\n" + incorrect_softgoal_syntax
+  else:
+    return "All softgoal are syntactically correct."
+# ############################################
+# ########## Incorrect Task Syntax ###########
+def find_non_VPs(sentences):
+  model_name = "QCRI/bert-base-multilingual-cased-pos-english"
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  model = AutoModelForTokenClassification.from_pretrained(model_name)
+  pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
+  outputs = pipeline(sentences)
+  Non_VPs = []
+  for idx, output in enumerate(outputs):
+    if not output[0]['entity'].startswith('V'):
+      Non_VPs.append(sentences[idx])
+  return Non_VPs
+def check_task_syntax(tasks):
+  incorrect_task_syntax = find_non_VPs(tasks)
+  if incorrect_task_syntax:
+    incorrect_task_syntax = "\n".join(incorrect_task_syntax)
+    return "Incorrect Tasks Syntax:\n" + incorrect_task_syntax
+  else:
+    return "All tasks are syntactically correct.""
+# ############################################
+# ########## Similarity ###########
+def get_similar_elements(elements_per_actor):
+  # Load the pre-trained model
+  model = CrossEncoder('cross-encoder/stsb-roberta-base')
+  # Prepare sentence pair array
+  sentence_pairs = []
+  for key, value in elements_per_actor.items():
+      for i in range(len(elements_per_actor[key])):
+          for j in range(i+1,len(elements_per_actor[key])):
+              sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
+  # Predict semantic similarity
+  semantic_similarity_scores = model.predict(sentence_pairs, show_progress_bar=True)
+  similar_elements = []
+  for index, value in enumerate(sentence_pairs):
+    if semantic_similarity_scores[index] > 0.5:
+      similar_elements.append(value)
+      #semantic_similarity["pair_"+str(index+1)] = [value,semantic_similarity_scores[index]]
+  if similar_elements:
+    similar_elements = [' and '.join(ele) for ele in similar_elements]
+    similar_elements = "\n".join(similar_elements)
+    return "The following elements are semantically similar:\n" + similar_elements
+  else:
+    return "There are no similar elements."
+  return semantic_similarity
+# #################################
+# ########## Misspelling ###########
+def get_misspelled_words(sentence):
+  spell = Speller(only_replacements=True)
+  misspelled= []
+  for word in sentence.split():
+      correct_word = spell(word)
+      if word != correct_word:
+          misspelled.append([word, correct_word])
+  return misspelled
+def check_spelling(elements):
+  spelling_mistakes = []
+  spelling_mistakes_string = ""
+  for key, value in elements.items():
+    for i in range(0, len(elements[key])):
+        if get_misspelled_words(elements[key][i]):
+            spelling_mistakes.append([elements[key][i], get_misspelled_words(elements[key][i])])
+  for idx, element in enumerate(spelling_mistakes):
+    for spelling_mistake in element[1]:
+      temp = ' should be written as '.join(spelling_mistake)
+      spelling_mistakes_string = spelling_mistakes_string + "\n" + element[0] + ": " + temp
+  return spelling_mistakes_string
+# ##################################
+# ########## NLI ###########
+def do_nli(premise, hypothesis, model, tokenizer):
+  # Tokenization
+  token_ids = []
+  seg_ids = []
+  mask_ids = []
+  premise_id = tokenizer.encode(premise, add_special_tokens = False)
+  hypothesis_id = tokenizer.encode(hypothesis, add_special_tokens = False)
+  pair_token_ids = [tokenizer.cls_token_id] + premise_id + [tokenizer.sep_token_id] + hypothesis_id + [tokenizer.sep_token_id]
+  premise_len = len(premise_id)
+  hypothesis_len = len(hypothesis_id)
+  segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
+  attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values
+  token_ids.append(torch.tensor(pair_token_ids))
+  seg_ids.append(segment_ids)
+  mask_ids.append(attention_mask_ids)
+  # Forward pass
+  token_ids = pad_sequence(token_ids, batch_first=True)
+  mask_ids = pad_sequence(mask_ids, batch_first=True)
+  seg_ids = pad_sequence(seg_ids, batch_first=True)
+  with torch.no_grad():
+    output = model(token_ids,
+                  token_type_ids=seg_ids,
+                  attention_mask=mask_ids)
+  # Output predication
+  result = ""
+  prediction = np.argmax(output.logits.cpu().numpy()).flatten().item()
+  if prediction == 0:
+    result = "Entailment"
+    #print("Entailment")
+  elif prediction == 1:
+    result = "Contradiction"
+    #print("Contradiction")
+  elif prediction == 2:
+    result = "Neutral"
+    #print("Neutral")
+  return result
+# Entailment
+def check_entailment(decomposed_elements):
+  model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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")
+  tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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", do_lower_case=True)
+  sentence_pairs = []
+  non_matching_elements = []
+  for key, value in decomposed_elements.items():
+      #print(key, value)
+      for i in decomposed_elements[key]:
+          #print(key, i)
+          sentence_pairs.append([key, i])
+  for sentence_pair in sentence_pairs:
+    result = do_nli(sentence_pair[0], sentence_pair[1], model, tokenizer)
+    print(result)
+    if result != "Entailment":
+      non_matching_elements.append(sentence_pair)
+  if non_matching_elements:
+    non_matching_elements = [' and '.join(ele) for ele in non_matching_elements]
+    non_matching_elements = "\n".join(non_matching_elements)
+    return "The following elements are miss matching:\n" + non_matching_elements
+  else:
+    return "There are no miss matched elements."
+  return result
+# Contradiction
+def check_contradiction(elements_per_actor):
+  model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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")
+  tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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", do_lower_case=True)
+  sentence_pairs = []
+  contradicting_elements = []
+  for key, value in elements_per_actor.items():
+      for i in range(len(elements_per_actor[key])):
+          for j in range(i+1,len(elements_per_actor[key])):
+              sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
+  #print(sentence_pairs)
+  # Check contradiction
+  for sentence_pair in sentence_pairs:
+    result = do_nli(sentence_pair[0], sentence_pair[1], model, tokenizer)
+    #print(result)
+    if result == "Contradiction":
+      contradicting_elements.append(sentence_pair)
+  if contradicting_elements:
+    contradicting_elements = [' and '.join(ele) for ele in contradicting_elements]
+    contradicting_elements = "\n".join(contradicting_elements)
+    return "The following elements are contradicting:\n" + contradicting_elements
+  else:
+    return "There are no contradicting elements."
+# ##########################
+# ************************* User Interface *************************
+def identify_bad_smells(tgrl_file, selected_bad_smells):
+  output = ""
+  tgrl_text = parse_tgrl(tgrl_file)
+  elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)
+  if 'Size' in selected_bad_smells:
+    output = output + get_long_elements(elements) + "\n\n"
+  if 'Complexity' in selected_bad_smells:
+    output = output + get_complex_sentences(elements) + "\n\n"
+  if 'Punctuations' in selected_bad_smells:
+    output = output + get_punctuations(elements) + "\n\n"
+  if 'Actors Syntax' in selected_bad_smells:
+    output = output + check_actor_syntax(elements['actors']) + "\n\n"
+  if 'Goals Syntax' in selected_bad_smells:
+    output = output + check_goal_syntax(elements['goals']) + "\n\n"
+  if 'Softgoals Syntax' in selected_bad_smells:
+    output = output + check_softgoal_syntax(elements['softGoals']) + "\n\n"
+  if 'Tasks Syntax' in selected_bad_smells:
+    output = output + check_task_syntax(elements['tasks']) + "\n\n"
+  if 'Similar Elements' in selected_bad_smells:
+    output = output + get_similar_elements(elements_per_actor) + "\n\n"
+  if 'Spelling Mistakes' in selected_bad_smells:
+    output = output + check_spelling(elements) + "\n\n"
+  if 'Goal-Subgoal Mismatch' in selected_bad_smells:
+    output = output + check_entailment(decomposed_elements) + "\n\n"
+  if 'Contradicting Elements' in selected_bad_smells:
+    output = output + check_contradiction(elements_per_actor) + "\n\n"
+  return output
+# Gradio UI: a file upload and a checkbox group are passed to identify_bad_smells;
+# its return value is shown as text. The checkbox labels are the exact strings
+# identify_bad_smells tests for.
+interface = gr.Interface(fn = identify_bad_smells,
+                         inputs = [gr.File(label="TGRL File"),
+                          gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
+                                           label="Which bad smells you want to detect?")],
+                         outputs = ["text"],
+                         title = "TGRL Bad Smells Detection",
+                         description = "Upload your .xgrl file and we will find the bad smells for you!")
+# Launch the app at import time with inline rendering turned off.
+interface.launch(inline = False)
+# Alternative kept for reference: launch with the default arguments.
+#interface.launch()