nouf-sst committed on
Commit 301fee0 · 1 Parent(s): 170f1c0

Fix parsing and complexity functions

Files changed (1)
  1. app.py +22 -81
app.py CHANGED
@@ -1,19 +1,19 @@
 import gradio as gr
 import re
 import json
+import numpy as np
 import nltk
+import stanza
+from stanza.models.constituency.parse_tree import Tree
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
 from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
-import numpy as np
-import spacy
-import en_core_web_sm
 
 # ***************************** Load needed models *****************************
-nlp = spacy.load('en_core_web_sm')
+nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
 pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
 pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
 sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
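
Note: the stanza pipeline introduced here needs its English models fetched once before stanza.Pipeline can be constructed. A minimal setup sketch, assuming a standard stanza install (stanza.download is the usual one-time step):

import stanza

stanza.download('en')  # one-time download of the English models
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')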
@@ -34,15 +34,15 @@ def parse_tgrl(file_obj):
 def extract_elements(tgrl_text):
 
     # Extract actors
-    actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s-]*)(?:\")", tgrl_text)
+    actors = re.findall("(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract goals
-    goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    goals = re.findall("(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract softGoals
-    softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    softGoals = re.findall("(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract tasks
-    tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    tasks = re.findall("(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
     # Extract resources
-    resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
+    resources = re.findall("(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
 
     elements = {
         "actors": actors,
@@ -151,13 +151,24 @@ def get_long_elements(elements, size_threshold): # Using RegEx
 # #####################################
 
 # ######### Complex Sentences #########
+def is_complex_sentence(sentence):
+
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
+    doc = nlp(sentence)
+    for sentence in doc.sentences:
+        unique_constituent_labels = Tree.get_unique_constituent_labels(sentence.constituency)
+        if 'SBAR' in unique_constituent_labels:
+            return True
+        else:
+            return False
+
 def get_complex_sentences(elements):
 
     complex_sentences = []
-
+
     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-            if len(get_clauses_list(elements[key][i])) > 1:
+            if is_complex_sentence(elements[key][i]):
                 complex_sentences.append(elements[key][i])
 
     if complex_sentences:
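
As committed, is_complex_sentence rebuilds a stanza.Pipeline on every call even though a module-level nlp pipeline is already loaded, shadows its sentence parameter inside the loop, and returns after inspecting only the first parsed sentence. A hedged sketch of an equivalent that reuses the global pipeline and keeps the same SBAR heuristic:

def is_complex_sentence(sentence):
    # Reuse the module-level stanza pipeline; constructing a Pipeline
    # per call reloads the models and dominates runtime.
    doc = nlp(sentence)
    # Flag the input as complex if any parsed sentence contains an SBAR
    # (subordinate clause) constituent in its parse tree.
    for sent in doc.sentences:
        labels = Tree.get_unique_constituent_labels(sent.constituency)
        if 'SBAR' in labels:
            return True
    return False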
@@ -166,76 +177,6 @@ def get_complex_sentences(elements):
     else:
         return "Complex sentences:\nNone."
 
-def find_root_of_sentence(doc):
-    root_token = None
-    for token in doc:
-        if (token.dep_ == "ROOT"):
-            root_token = token
-    return root_token
-
-def find_other_verbs(doc, root_token):
-    other_verbs = []
-    for token in doc:
-        ancestors = list(token.ancestors)
-        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
-            other_verbs.append(token)
-    return other_verbs
-
-# find the token spans for each verb
-def get_clause_token_span_for_verb(verb, doc, all_verbs):
-    first_token_index = len(doc)
-    last_token_index = 0
-    this_verb_children = list(verb.children)
-    for child in this_verb_children:
-        if (child not in all_verbs):
-            if (child.i < first_token_index):
-                first_token_index = child.i
-            if (child.i > last_token_index):
-                last_token_index = child.i
-    return (first_token_index, last_token_index)
-
-def get_clauses_list(sent):
-
-    doc = nlp(sent)
-
-    # find part of speech, dependency tag, ancestors, and children of each token
-    for token in doc:
-        ancestors = [t.text for t in token.ancestors]
-        children = [t.text for t in token.children]
-        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
-
-    # find the root token of the sentence
-    root_token = find_root_of_sentence(doc)
-
-    # find the other verbs
-    other_verbs = find_other_verbs(doc, root_token)
-
-    # put together all the verbs in one array and process each using get_clause_token_span_for_verb function
-    # this will return a tuple of start and end indices for each verb's clause
-    token_spans = []
-    all_verbs = [root_token] + other_verbs
-    for other_verb in all_verbs:
-        (first_token_index, last_token_index) = \
-            get_clause_token_span_for_verb(other_verb,
-                                           doc, all_verbs)
-        token_spans.append((first_token_index,
-                            last_token_index))
-
-    # put together token spans for each clause
-    sentence_clauses = []
-    for token_span in token_spans:
-        start = token_span[0]
-        end = token_span[1]
-        if (start < end):
-            clause = doc[start:end]
-            sentence_clauses.append(clause)
-    sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
-
-    # get the final result
-    clauses_text = [clause.text for clause in sentence_clauses]
-    #print(clauses_text)
-    return clauses_text
-
 # #####################################
 
 # ########## Punctuations #########
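
Taken together, a sketch of how the changed pieces would interact end to end (the TGRL sample and element names are hypothetical; extract_elements and get_complex_sentences are as defined above):

sample_tgrl = '''
actor a1 { name = "Student" }
goal g1 { name = "Pass the course, even if attendance is low" }
'''

elements = extract_elements(sample_tgrl)
# The goal name survives extraction thanks to the widened regex, and its
# "even if ..." clause should yield an SBAR node, flagging it as complex.
print(get_complex_sentences(elements))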
 