pritamdeka committed on
Commit
0dd2adc
·
1 Parent(s): 7169eb0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -35
app.py CHANGED
@@ -54,31 +54,27 @@ sp = en_core_sci_lg.load()
54
  all_stopwords = sp.Defaults.stop_words
55
 
56
 
 
 
 
57
 
58
 
59
 
60
-
61
- word_embedding_model = models.Transformer('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
62
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
63
  pooling_mode_mean_tokens=True,
64
  pooling_mode_cls_token=False,
65
  pooling_mode_max_tokens=False)
66
 
67
- embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
68
-
69
-
70
- def remove_stopwords(sen):
71
- sen_new = " ".join([i for i in sen if i not in stop_words])
72
- return sen_new
73
-
74
-
75
-
76
- def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
77
  element=[]
78
  cluster_list_final=[]
79
  comb_list=[]
80
  comb=[]
81
  title_list=[]
 
 
82
  silhouette_score_list=[]
83
  final_textrank_list=[]
84
  document=[]
@@ -89,8 +85,6 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
89
  model_1 = SentenceTransformer(model_1)
90
  model_2 = SentenceTransformer(model_2)
91
  url = article_link
92
- if (url == False):
93
- print("error")
94
  html = requests.get(url).text
95
  article = fulltext(html)
96
  corpus=sent_tokenize(article)
@@ -203,33 +197,42 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
203
  last_url='esearch.fcgi?db=pubmed'+'&term='+f_1
204
  search_rettype = '&rettype=json'
205
  overall_url=ncbi_url+last_url+search_rettype+'&sort=relevance'
206
- r = requests.get(overall_url)
207
 
208
- root = ET.fromstring(r.text)
209
  levels = root.findall('.//Id')
210
- name_list=[]
211
  for level in levels:
212
  name = level.text
213
- name_list.append(name)
214
- name_1 = ','.join(name_list)
215
  fetch_url='efetch.fcgi?db=pubmed'
216
- search_id='&id='+name_1
217
  ret_type='&rettype=text'
218
  ret_mode='&retmode=xml'
219
- ret_max='&retmax=10'
220
  ret_sort='&sort=relevance'
221
  return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
222
- r_1 = requests.get(return_url)
223
- root_1 = ET.fromstring(r_1.text)
224
- levels_1 = root_1.findall('.//ArticleTitle')
225
- for level in levels_1:
226
- name = level.text
227
- title_list.append(name)
 
 
 
 
 
 
 
 
228
 
229
 
230
- return title_list
231
 
232
- gr.Interface(keyphrase_generator,
 
233
  inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here",default="", label="Article web link"),
234
  gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
235
  'sentence-transformers/all-mpnet-base-v1',
@@ -261,13 +264,21 @@ gr.Interface(keyphrase_generator,
261
  'sentence-transformers/all-MiniLM-L6-v2'],
262
  type="value",
263
  default='sentence-transformers/all-mpnet-base-v1',
264
- label="Select any SBERT model for keyphrases from the list below"),
265
- gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
266
- outputs=gr.outputs.Textbox(type="auto", label="Stuff"),
 
 
 
 
 
267
  theme="peach",
268
  title="Scientific Article Keyphrase Generator", description="Generates the keyphrases from an article which best describes the article.",
269
- article= "The work is based on a part of the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
270
  "\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
 
271
  "\t The list of SBERT models required in the textboxes can be found in <a href=www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
272
  "\t The default model names are provided which can be changed from the list of pretrained models. "
273
- "\t The value of output keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30.").launch(share=True,server_name='0.0.0.0',show_error=True)
 
 
 
54
  all_stopwords = sp.Defaults.stop_words
55
 
56
 
57
+ def remove_stopwords(sen):
58
+ sen_new = " ".join([i for i in sen if i not in stop_words])
59
+ return sen_new
60
 
61
 
62
 
63
+ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_3):
64
+ word_embedding_model = models.Transformer(model_3)
65
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
66
  pooling_mode_mean_tokens=True,
67
  pooling_mode_cls_token=False,
68
  pooling_mode_max_tokens=False)
69
 
70
+ embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
 
 
 
 
 
 
 
 
 
71
  element=[]
72
  cluster_list_final=[]
73
  comb_list=[]
74
  comb=[]
75
  title_list=[]
76
+ titles_list=[]
77
+ abstracts_list=[]
78
  silhouette_score_list=[]
79
  final_textrank_list=[]
80
  document=[]
 
85
  model_1 = SentenceTransformer(model_1)
86
  model_2 = SentenceTransformer(model_2)
87
  url = article_link
 
 
88
  html = requests.get(url).text
89
  article = fulltext(html)
90
  corpus=sent_tokenize(article)
 
197
  last_url='esearch.fcgi?db=pubmed'+'&term='+f_1
198
  search_rettype = '&rettype=json'
199
  overall_url=ncbi_url+last_url+search_rettype+'&sort=relevance'
200
+ pubmed_search_request = requests.get(overall_url)
201
 
202
+ root = ET.fromstring(pubmed_search_request.text)
203
  levels = root.findall('.//Id')
204
+ search_id_list=[]
205
  for level in levels:
206
  name = level.text
207
+ search_id_list.append(name)
208
+ all_search_ids = ','.join(search_id_list)
209
  fetch_url='efetch.fcgi?db=pubmed'
210
+ search_id='&id='+all_search_ids
211
  ret_type='&rettype=text'
212
  ret_mode='&retmode=xml'
213
+ ret_max='&retmax=20'
214
  ret_sort='&sort=relevance'
215
  return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
216
+ pubmed_abstract_request = requests.get(return_url)
217
+ root_1 = ET.fromstring(pubmed_abstract_request.text)
218
+ article_title = root_1.findall('.//ArticleTitle')
219
+ for a in article_title:
220
+ article_title_name = a.text
221
+ titles_list.append(article_title_name)
222
+ article_abstract = root_1.findall('.//AbstractText')
223
+ for b in article_abstract:
224
+ article_abstract_name = b.text
225
+ abstracts_list.append(article_abstract_name)
226
+ mydict = {'Title': titles_list, 'Abstract':abstracts_list}
227
+
228
+
229
+ df_new = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in mydict.items() ]))
230
 
231
 
232
+ return df_new
233
 
234
+
235
+ igen_pubmed = gr.Interface(keyphrase_generator,
236
  inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here",default="", label="Article web link"),
237
  gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
238
  'sentence-transformers/all-mpnet-base-v1',
 
264
  'sentence-transformers/all-MiniLM-L6-v2'],
265
  type="value",
266
  default='sentence-transformers/all-mpnet-base-v1',
267
+ label="Select any SBERT model for keyphrases from the list below"),
268
+ gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")
269
+ gr.inputs.Dropdown(choices=['cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
270
+ 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'],
271
+ type="value",
272
+ default='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
273
+ label="Select any SapBERT model for clustering from the list below"),],
274
+ outputs=gr.outputs.Dataframe(type="auto", label="dataframe",max_rows=10, max_cols=None, overflow_row_behaviour="paginate"),
275
  theme="peach",
276
  title="Scientific Article Keyphrase Generator", description="Generates the keyphrases from an article which best describes the article.",
277
+ article= "The work is based the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
278
  "\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
279
+ "\t The application then uses a <a href=https://arxiv.org/abs/2010.11784>UMLS based Bert model</a> to cluster the keyphrases using K-means clustering method and finally create a boolean query. After that the top 20 titles and abstracts are retrieved from PubMed database and displayed according to relevancy. The UMLS Bert models can be chosen from the list provided. "
280
  "\t The list of SBERT models required in the textboxes can be found in <a href=www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
281
  "\t The default model names are provided which can be changed from the list of pretrained models. "
282
+ "\t The value of keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30.")
283
+
284
+ igen_pubmed.launch(share=True,server_name='0.0.0.0',show_error=True)