Commit · c3412a2
Parent(s): d866b07
import all functions and classes from helper_function.py
app.py CHANGED
@@ -21,6 +21,65 @@ print('Libraries called succesfully!!!!"')
 
 
 
+class keyWordExtractor():
+
+    def __init__(self,
+                 article_text,
+                 similarity_model,
+                 n_gram=1,
+                 top_n=3,
+                 french_stopwords=None,
+                 ner=None,
+                 ):
+        self.article_text = article_text
+        self.french_stopwords = french_stopwords
+        self.candidates = self.count_vectorizer(n_gram)
+        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
+        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)
+
+    def count_vectorizer(self, n_gram):
+        n_gram_range = (n_gram, n_gram)
+        # Extract candidate words/phrases
+        count = CountVectorizer(ngram_range=n_gram_range,
+                                stop_words=self.french_stopwords).fit([self.article_text])  # Main change
+        candidates = count.get_feature_names_out()
+
+        return candidates
+
+    def slice_only_noun_token(self, ner, token_list):
+        """
+        Given the tokenized list, return only the "NOUN" and "PROPN" tokens.
+        Args:
+            ner (spacy): The spaCy pipeline used to read `token.pos_`
+            token_list (list): List of tokens from the full article
+        Returns:
+            Two lists of tokens, restricted to the "NOUN" and "PROPN" parts of speech
+        """
+
+        noun_slice_list = []
+        proper_noun_slice_list = []
+        for word_idx in range(len(token_list)):
+            doc = ner(token_list[word_idx])
+
+            for token in doc:
+                if token.pos_ == 'NOUN':
+                    noun_slice_list.append(token.text)
+                elif token.pos_ == 'PROPN':
+                    proper_noun_slice_list.append(token.text)
+
+        return noun_slice_list, proper_noun_slice_list
+
+    def top_n_extractor(self, model, top_n):
+        doc_embedding = model.encode([self.article_text])
+        candidate_embeddings = model.encode(self.noun_candidates)
+        distances = cosine_similarity(doc_embedding, candidate_embeddings)
+        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]
+
+        return keywords
+
+def clear_input():
+    return ("", "")
+
 def t5_generate_summary(article_text):
     input_ids = t5_tokenizer(
         [WHITESPACE_HANDLER(article_text)],