Spaces:
Runtime error
Runtime error
Commit
·
0dd2adc
1
Parent(s):
7169eb0
Update app.py
Browse files
app.py
CHANGED
@@ -54,31 +54,27 @@ sp = en_core_sci_lg.load()
|
|
54 |
all_stopwords = sp.Defaults.stop_words
|
55 |
|
56 |
|
|
|
|
|
|
|
57 |
|
58 |
|
59 |
|
60 |
-
|
61 |
-
word_embedding_model = models.Transformer(
|
62 |
-
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
|
63 |
pooling_mode_mean_tokens=True,
|
64 |
pooling_mode_cls_token=False,
|
65 |
pooling_mode_max_tokens=False)
|
66 |
|
67 |
-
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
68 |
-
|
69 |
-
|
70 |
-
def remove_stopwords(sen):
|
71 |
-
sen_new = " ".join([i for i in sen if i not in stop_words])
|
72 |
-
return sen_new
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
|
77 |
element=[]
|
78 |
cluster_list_final=[]
|
79 |
comb_list=[]
|
80 |
comb=[]
|
81 |
title_list=[]
|
|
|
|
|
82 |
silhouette_score_list=[]
|
83 |
final_textrank_list=[]
|
84 |
document=[]
|
@@ -89,8 +85,6 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
|
|
89 |
model_1 = SentenceTransformer(model_1)
|
90 |
model_2 = SentenceTransformer(model_2)
|
91 |
url = article_link
|
92 |
-
if (url == False):
|
93 |
-
print("error")
|
94 |
html = requests.get(url).text
|
95 |
article = fulltext(html)
|
96 |
corpus=sent_tokenize(article)
|
@@ -203,33 +197,42 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
|
|
203 |
last_url='esearch.fcgi?db=pubmed'+'&term='+f_1
|
204 |
search_rettype = '&rettype=json'
|
205 |
overall_url=ncbi_url+last_url+search_rettype+'&sort=relevance'
|
206 |
-
|
207 |
|
208 |
-
root = ET.fromstring(
|
209 |
levels = root.findall('.//Id')
|
210 |
-
|
211 |
for level in levels:
|
212 |
name = level.text
|
213 |
-
|
214 |
-
|
215 |
fetch_url='efetch.fcgi?db=pubmed'
|
216 |
-
search_id='&id='+
|
217 |
ret_type='&rettype=text'
|
218 |
ret_mode='&retmode=xml'
|
219 |
-
ret_max='&retmax=
|
220 |
ret_sort='&sort=relevance'
|
221 |
return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
|
222 |
-
|
223 |
-
root_1 = ET.fromstring(
|
224 |
-
|
225 |
-
for
|
226 |
-
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
|
229 |
|
230 |
-
return
|
231 |
|
232 |
-
|
|
|
233 |
inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here",default="", label="Article web link"),
|
234 |
gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
|
235 |
'sentence-transformers/all-mpnet-base-v1',
|
@@ -261,13 +264,21 @@ gr.Interface(keyphrase_generator,
|
|
261 |
'sentence-transformers/all-MiniLM-L6-v2'],
|
262 |
type="value",
|
263 |
default='sentence-transformers/all-mpnet-base-v1',
|
264 |
-
label="Select any SBERT model for keyphrases from the list below"),
|
265 |
-
gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")
|
266 |
-
|
|
|
|
|
|
|
|
|
|
|
267 |
theme="peach",
|
268 |
title="Scientific Article Keyphrase Generator", description="Generates the keyphrases from an article which best describes the article.",
|
269 |
-
article= "The work is based
|
270 |
"\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
|
|
|
271 |
"\t The list of SBERT models required in the textboxes can be found in <a href=www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
|
272 |
"\t The default model names are provided which can be changed from the list of pretrained models. "
|
273 |
-
"\t The value of
|
|
|
|
|
|
54 |
all_stopwords = sp.Defaults.stop_words
|
55 |
|
56 |
|
57 |
+
def remove_stopwords(sen):
    """Join the items of ``sen`` that are not stop words into one string.

    Each item produced by iterating ``sen`` is kept only if it is not
    contained in ``stop_words`` (a name defined outside this function);
    the kept items are joined with single spaces.

    NOTE(review): presumably ``sen`` is an iterable of word strings; if a
    plain string is passed, iteration is per character -- confirm against
    the caller.
    """
    kept_items = (item for item in sen if item not in stop_words)
    return " ".join(kept_items)
|
60 |
|
61 |
|
62 |
|
63 |
+
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_3):
|
64 |
+
word_embedding_model = models.Transformer(model_3)
|
65 |
+
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
|
66 |
pooling_mode_mean_tokens=True,
|
67 |
pooling_mode_cls_token=False,
|
68 |
pooling_mode_max_tokens=False)
|
69 |
|
70 |
+
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
element=[]
|
72 |
cluster_list_final=[]
|
73 |
comb_list=[]
|
74 |
comb=[]
|
75 |
title_list=[]
|
76 |
+
titles_list=[]
|
77 |
+
abstracts_list=[]
|
78 |
silhouette_score_list=[]
|
79 |
final_textrank_list=[]
|
80 |
document=[]
|
|
|
85 |
model_1 = SentenceTransformer(model_1)
|
86 |
model_2 = SentenceTransformer(model_2)
|
87 |
url = article_link
|
|
|
|
|
88 |
html = requests.get(url).text
|
89 |
article = fulltext(html)
|
90 |
corpus=sent_tokenize(article)
|
|
|
197 |
last_url='esearch.fcgi?db=pubmed'+'&term='+f_1
|
198 |
search_rettype = '&rettype=json'
|
199 |
overall_url=ncbi_url+last_url+search_rettype+'&sort=relevance'
|
200 |
+
pubmed_search_request = requests.get(overall_url)
|
201 |
|
202 |
+
root = ET.fromstring(pubmed_search_request.text)
|
203 |
levels = root.findall('.//Id')
|
204 |
+
search_id_list=[]
|
205 |
for level in levels:
|
206 |
name = level.text
|
207 |
+
search_id_list.append(name)
|
208 |
+
all_search_ids = ','.join(search_id_list)
|
209 |
fetch_url='efetch.fcgi?db=pubmed'
|
210 |
+
search_id='&id='+all_search_ids
|
211 |
ret_type='&rettype=text'
|
212 |
ret_mode='&retmode=xml'
|
213 |
+
ret_max='&retmax=20'
|
214 |
ret_sort='&sort=relevance'
|
215 |
return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
|
216 |
+
pubmed_abstract_request = requests.get(return_url)
|
217 |
+
root_1 = ET.fromstring(pubmed_abstract_request.text)
|
218 |
+
article_title = root_1.findall('.//ArticleTitle')
|
219 |
+
for a in article_title:
|
220 |
+
article_title_name = a.text
|
221 |
+
titles_list.append(article_title_name)
|
222 |
+
article_abstract = root_1.findall('.//AbstractText')
|
223 |
+
for b in article_abstract:
|
224 |
+
article_abstract_name = b.text
|
225 |
+
abstracts_list.append(article_abstract_name)
|
226 |
+
mydict = {'Title': titles_list, 'Abstract':abstracts_list}
|
227 |
+
|
228 |
+
|
229 |
+
df_new = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in mydict.items() ]))
|
230 |
|
231 |
|
232 |
+
return df_new
|
233 |
|
234 |
+
|
235 |
+
igen_pubmed = gr.Interface(keyphrase_generator,
|
236 |
inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here",default="", label="Article web link"),
|
237 |
gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
|
238 |
'sentence-transformers/all-mpnet-base-v1',
|
|
|
264 |
'sentence-transformers/all-MiniLM-L6-v2'],
|
265 |
type="value",
|
266 |
default='sentence-transformers/all-mpnet-base-v1',
|
267 |
+
label="Select any SBERT model for keyphrases from the list below"),
|
268 |
+
# Max-keywords slider. The trailing comma is required because another input
# (the SapBERT model dropdown) follows it in the same ``inputs=[...]`` list;
# without it the two adjacent calls are a SyntaxError.
gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords"),
|
269 |
+
gr.inputs.Dropdown(choices=['cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
|
270 |
+
'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'],
|
271 |
+
type="value",
|
272 |
+
default='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
|
273 |
+
label="Select any SapBERT model for clustering from the list below"),],
|
274 |
+
outputs=gr.outputs.Dataframe(type="auto", label="dataframe",max_rows=10, max_cols=None, overflow_row_behaviour="paginate"),
|
275 |
theme="peach",
|
276 |
title="Scientific Article Keyphrase Generator", description="Generates the keyphrases from an article which best describes the article.",
|
277 |
+
article= "The work is based the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
|
278 |
"\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
|
279 |
+
"\t The application then uses a <a href=https://arxiv.org/abs/2010.11784>UMLS based Bert model</a> to cluster the keyphrases using K-means clustering method and finally create a boolean query. After that the top 20 titles and abstracts are retrieved from PubMed database and displayed according to relevancy. The UMLS Bert models can be chosen from the list provided. "
|
280 |
"\t The list of SBERT models required in the textboxes can be found in <a href=www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
|
281 |
"\t The default model names are provided which can be changed from the list of pretrained models. "
|
282 |
+
"\t The value of keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30.")
|
283 |
+
|
284 |
+
igen_pubmed.launch(share=True,server_name='0.0.0.0',show_error=True)
|