Carlosito16 committed
Commit c3412a2 · 1 Parent(s): d866b07

import all functions and classes from helper_function.py

Files changed (1):
app.py +59 -0
app.py CHANGED
@@ -21,6 +21,65 @@ print('Libraries called succesfully!!!!"')
 
 
 
+class keyWordExtractor():
+
+    def __init__(self,
+                 article_text,
+                 similarity_model,
+                 n_gram=1,
+                 top_n=3,
+                 french_stopwords=None,
+                 ner=None,
+                 ):
+        self.article_text = article_text
+        self.french_stopwords = french_stopwords
+        self.candidates = self.count_vectorizer(n_gram)
+        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
+        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)
+
+    def count_vectorizer(self, n_gram):
+        n_gram_range = (n_gram, n_gram)
+        # Extract candidate words/phrases
+        count = CountVectorizer(ngram_range=n_gram_range,
+                                stop_words=self.french_stopwords).fit([self.article_text])  # Main change
+        candidates = count.get_feature_names_out()
+
+        return candidates
+
+    def slice_only_noun_token(self, ner, token_list):
+        """
+        Given the tokenized list, this function returns only the "NOUN" and "PROPN" tokens.
+        Args:
+            ner (spacy): The spaCy pipeline used to read `token.pos_`
+            token_list (list): List of tokens from the full article
+        Returns:
+            noun_slice_list (list): Tokens whose part of speech is "NOUN"
+            proper_noun_slice_list (list): Tokens whose part of speech is "PROPN"
+        """
+
+        noun_slice_list = []
+        proper_noun_slice_list = []
+        for word_idx in range(len(token_list)):
+            doc = ner(token_list[word_idx])
+
+            for token in doc:
+                if token.pos_ == 'NOUN':
+                    noun_slice_list.append(token.text)
+                elif token.pos_ == 'PROPN':
+                    proper_noun_slice_list.append(token.text)
+
+        return noun_slice_list, proper_noun_slice_list
+
+    def top_n_extractor(self, model, top_n):
+        # Embed the article and every noun candidate, then keep the
+        # top_n candidates closest to the document embedding.
+        doc_embedding = model.encode([self.article_text])
+        candidate_embeddings = model.encode(self.noun_candidates)
+        distances = cosine_similarity(doc_embedding, candidate_embeddings)
+        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]
+
+        return keywords
+
+
+def clear_input():
+    return ("", "")
+
 def t5_generate_summary(article_text):
     input_ids = t5_tokenizer(
         [WHITESPACE_HANDLER(article_text)],
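
For reference, a minimal sketch of how the new class could be driven from app.py. It assumes CountVectorizer and cosine_similarity are already imported at the top of app.py (as the "Libraries called" print suggests); the model names, the stopword list, and the sample text below are placeholders for illustration, not necessarily what app.py actually loads.

# Illustrative usage of keyWordExtractor; names below are assumptions.
import spacy
from sentence_transformers import SentenceTransformer

ner = spacy.load("fr_core_news_sm")  # assumed French spaCy pipeline
similarity_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")  # assumed embedding model
french_stopwords = ["le", "la", "les", "de", "des", "un", "une"]  # placeholder stopword list
article_text = "..."  # article to extract keywords from

extractor = keyWordExtractor(
    article_text,
    similarity_model,
    n_gram=1,                # unigram candidates
    top_n=3,                 # keep the 3 closest keywords
    french_stopwords=french_stopwords,
    ner=ner,
)
print(extractor.top_n_keywords)

Note that top_n_extractor ranks only the NOUN candidates; the PROPN list is stored on the instance but does not take part in the ranking.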