nouf-sst committed on
Commit 301fee0 · 1 Parent(s): 170f1c0

Fix parsing and complexity functions

Files changed (1)
  1. app.py +22 -81
app.py CHANGED
@@ -1,19 +1,19 @@
 import gradio as gr
 import re
 import json
+import numpy as np
 import nltk
+import stanza
+from stanza.models.constituency.parse_tree import Tree
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
 from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
-import numpy as np
-import spacy
-import en_core_web_sm
 
 # ***************************** Load needed models *****************************
-nlp = spacy.load('en_core_web_sm')
+nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
 pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
 pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
 sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
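
Note: the stanza pipeline introduced here needs its English models fetched once before stanza.Pipeline can be constructed. A minimal setup sketch, assuming a standard stanza install (stanza.download is the usual one-time step):

import stanza

stanza.download('en')  # one-time download of the English models
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')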
@@ -34,15 +34,15 @@ def parse_tgrl(file_obj):
 def extract_elements(tgrl_text):
 
     # Extract actors
-    actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s-]*)(?:\")", tgrl_text)
+    actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract goals
-    goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract softGoals
-    softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract tasks
-    tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract resources
-    resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
 
     elements = {
         "actors": actors,
@@ -151,13 +151,24 @@ def get_long_elements(elements, size_threshold): # Using RegEx
 # #####################################
 
 # ######### Complex Sentences #########
+def is_complex_sentence(sentence):
+
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
+    doc = nlp(sentence)
+    for sentence in doc.sentences:
+        unique_constituent_labels = Tree.get_unique_constituent_labels(sentence.constituency)
+        if 'SBAR' in unique_constituent_labels:
+            return True
+        else:
+            return False
+
 def get_complex_sentences(elements):
 
     complex_sentences = []
-
+
     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-            if len(get_clauses_list(elements[key][i])) > 1:
+            if is_complex_sentence(elements[key][i]):
                 complex_sentences.append(elements[key][i])
 
     if complex_sentences:
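
As committed, is_complex_sentence rebuilds a stanza.Pipeline on every call even though a module-level nlp pipeline is already loaded, shadows its sentence parameter inside the loop, and returns after inspecting only the first parsed sentence. A hedged sketch of an equivalent that reuses the global pipeline and keeps the same SBAR heuristic:

def is_complex_sentence(sentence):
    # Reuse the module-level stanza pipeline; constructing a Pipeline
    # per call reloads the models and dominates runtime.
    doc = nlp(sentence)
    # Flag the input as complex if any parsed sentence contains an SBAR
    # (subordinate clause) constituent in its parse tree.
    for sent in doc.sentences:
        labels = Tree.get_unique_constituent_labels(sent.constituency)
        if 'SBAR' in labels:
            return True
    return False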
@@ -166,76 +177,6 @@ def get_complex_sentences(elements):
     else:
         return "Complex sentences:\nNone."
 
-def find_root_of_sentence(doc):
-    root_token = None
-    for token in doc:
-        if (token.dep_ == "ROOT"):
-            root_token = token
-    return root_token
-
-def find_other_verbs(doc, root_token):
-    other_verbs = []
-    for token in doc:
-        ancestors = list(token.ancestors)
-        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
-            other_verbs.append(token)
-    return other_verbs
-
-# find the token spans for each verb
-def get_clause_token_span_for_verb(verb, doc, all_verbs):
-    first_token_index = len(doc)
-    last_token_index = 0
-    this_verb_children = list(verb.children)
-    for child in this_verb_children:
-        if (child not in all_verbs):
-            if (child.i < first_token_index):
-                first_token_index = child.i
-            if (child.i > last_token_index):
-                last_token_index = child.i
-    return (first_token_index, last_token_index)
-
-def get_clauses_list(sent):
-
-    doc = nlp(sent)
-
-    # find part of speech, dependency tag, ancestors, and children of each token
-    for token in doc:
-        ancestors = [t.text for t in token.ancestors]
-        children = [t.text for t in token.children]
-        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
-
-    # find the root token of the sentence
-    root_token = find_root_of_sentence(doc)
-
-    # find the other verbs
-    other_verbs = find_other_verbs(doc, root_token)
-
-    # put together all the verbs in one array and process each using get_clause_token_span_for_verb function
-    # this will return a tuple of start and end indices for each verb's clause
-    token_spans = []
-    all_verbs = [root_token] + other_verbs
-    for other_verb in all_verbs:
-        (first_token_index, last_token_index) = \
-            get_clause_token_span_for_verb(other_verb,
-                                           doc, all_verbs)
-        token_spans.append((first_token_index,
-                            last_token_index))
-
-    # put together token spans for each clause
-    sentence_clauses = []
-    for token_span in token_spans:
-        start = token_span[0]
-        end = token_span[1]
-        if (start < end):
-            clause = doc[start:end]
-            sentence_clauses.append(clause)
-    sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
-
-    # get the final result
-    clauses_text = [clause.text for clause in sentence_clauses]
-    #print(clauses_text)
-    return clauses_text
-
 # #####################################
 
 # ########## Punctuations #########
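
Taken together, a sketch of how the changed pieces would interact end to end (the TGRL sample and element names are hypothetical; extract_elements and get_complex_sentences are as defined above):

sample_tgrl = '''
actor a1 { name = "Student" }
goal g1 { name = "Pass the course, even if attendance is low" }
'''

elements = extract_elements(sample_tgrl)
# The goal name survives extraction thanks to the widened regex, and its
# "even if ..." clause should yield an SBAR node, flagging it as complex.
print(get_complex_sentences(elements))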
 