Fix parsing and complexity functions
app.py
CHANGED
@@ -1,19 +1,19 @@
 import gradio as gr
 import re
 import json
+import numpy as np
 import nltk
+import stanza
+from stanza.models.constituency.parse_tree import Tree
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
 from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
-import numpy as np
-import spacy
-import en_core_web_sm

 # ***************************** Load needed models *****************************
-nlp = en_core_web_sm.load()
+nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
 pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
 pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
 sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
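The Stanza pipeline introduced above is what the rest of this commit leans on. A minimal sketch of how it is exercised, assuming the English models have already been fetched once with stanza.download('en') (a step the Space would need at build or startup time; it is not shown in this diff):

import stanza

stanza.download('en')  # one-time model download; safe to re-run
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

doc = nlp("The system retries when the connection drops.")
for sent in doc.sentences:
    # sent.constituency is a bracketed parse tree such as (ROOT (S ...))
    print(sent.constituency)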
@@ -34,15 +34,15 @@ def parse_tgrl(file_obj):
 def extract_elements(tgrl_text):

     # Extract actors
-    actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract goals
-    goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract softGoals
-    softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract tasks
-    tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract resources
-    resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)

     elements = {
         "actors": actors,
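The only change in this hunk is the character class. The old [A-Za-z\s]* pattern required the element name to contain letters and whitespace only, so a name with punctuation never reached its closing quote and the whole element was silently dropped; the new [A-Za-z\s;.,!?:-]* class admits common punctuation. A quick check (the TGRL snippet is illustrative, not taken from the Space):

import re

tgrl = 'actor student1 { name = "Student, part-time" }'

old = r'(?:.*?actor\s\S+\s?{\s?name\s?=\s?")([A-Za-z\s]*)(?:")'
new = r'(?:.*?actor\s\S+\s?{\s?name\s?=\s?")([A-Za-z\s;.,!?:-]*)(?:")'

print(re.findall(old, tgrl))  # [] (a punctuated name never matches)
print(re.findall(new, tgrl))  # ['Student, part-time']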
@@ -151,13 +151,24 @@ def get_long_elements(elements, size_threshold): # Using RegEx
 # #####################################

 # ######### Complex Sentences #########
+def is_complex_sentence(sentence):
+
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
+    doc = nlp(sentence)
+    for sentence in doc.sentences:
+        unique_constituent_labels = Tree.get_unique_constituent_labels(sentence.constituency)
+        if 'SBAR' in unique_constituent_labels:
+            return True
+        else:
+            return False
+
 def get_complex_sentences(elements):

     complex_sentences = []
-
+
     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-            if len(get_clauses_list(elements[key][i])) > 1:
+            if is_complex_sentence(elements[key][i]):
                 complex_sentences.append(elements[key][i])

     if complex_sentences:
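Two details of the new helper are worth flagging: it builds a fresh stanza.Pipeline on every call even though a module-level nlp already exists, and the return statements inside the loop mean only the first sentence of a multi-sentence element is ever inspected. A variant that reuses the pipeline and scans every sentence might look like the sketch below; this is an assumption about the intent, not the committed code:

from stanza.models.constituency.parse_tree import Tree

def is_complex_sentence_v2(text, nlp):
    # True if any sentence's constituency parse contains an SBAR
    # (subordinate clause) node; nlp is the already-built Stanza pipeline.
    doc = nlp(text)
    return any(
        'SBAR' in Tree.get_unique_constituent_labels(sent.constituency)
        for sent in doc.sentences
    )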
@@ -166,76 +177,6 @@ def get_complex_sentences(elements):
     else:
         return "Complex sentences:\nNone."

-def find_root_of_sentence(doc):
-    root_token = None
-    for token in doc:
-        if (token.dep_ == "ROOT"):
-            root_token = token
-    return root_token
-
-def find_other_verbs(doc, root_token):
-    other_verbs = []
-    for token in doc:
-        ancestors = list(token.ancestors)
-        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
-            other_verbs.append(token)
-    return other_verbs
-
-# find the token spans for each verb
-def get_clause_token_span_for_verb(verb, doc, all_verbs):
-    first_token_index = len(doc)
-    last_token_index = 0
-    this_verb_children = list(verb.children)
-    for child in this_verb_children:
-        if (child not in all_verbs):
-            if (child.i < first_token_index):
-                first_token_index = child.i
-            if (child.i > last_token_index):
-                last_token_index = child.i
-    return(first_token_index, last_token_index)
-
-def get_clauses_list(sent):
-
-    doc = nlp(sent)
-
-    # find part of speech, dependency tag, ancestors, and children of each token
-    for token in doc:
-        ancestors = [t.text for t in token.ancestors]
-        children = [t.text for t in token.children]
-        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
-
-    # find the root token of the sentence
-    root_token = find_root_of_sentence(doc)
-
-    # find the other verbs
-    other_verbs = find_other_verbs(doc, root_token)
-
-    # put together all the verbs in one array and process each using the get_clause_token_span_for_verb function
-    # this will return a tuple of start and end indices for each verb's clause
-    token_spans = []
-    all_verbs = [root_token] + other_verbs
-    for other_verb in all_verbs:
-        (first_token_index, last_token_index) = \
-            get_clause_token_span_for_verb(other_verb,
-                                           doc, all_verbs)
-        token_spans.append((first_token_index,
-                            last_token_index))
-
-    # put together token spans for each clause
-    sentence_clauses = []
-    for token_span in token_spans:
-        start = token_span[0]
-        end = token_span[1]
-        if (start < end):
-            clause = doc[start:end]
-            sentence_clauses.append(clause)
-    sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
-
-    # get the final result
-    clauses_text = [clause.text for clause in sentence_clauses]
-    #print(clauses_text)
-    return clauses_text
-
 # #####################################

 # ########## Punctuations #########
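The deleted helpers implemented the previous heuristic: spaCy dependency parses, one token span per verb, and a sentence counted as complex when it yielded more than one clause. The replacement asks the constituency parse a single question instead: does the tree contain an SBAR (subordinate clause) node? Assumed behaviour on two illustrative inputs:

print(is_complex_sentence("The system stores the record."))
# False: no subordinate clause in the parse

print(is_complex_sentence("The system retries because the connection dropped."))
# True: "because the connection dropped" should parse as an SBAR subtree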