jattokatarratto committed on
Commit
233331d
·
verified ·
1 Parent(s): 8102060

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -43
app.py CHANGED
@@ -20,8 +20,10 @@ from typing import Dict
20
  from concurrent.futures import ThreadPoolExecutor, as_completed
21
  from collections import Counter
22
 
 
23
  import torch
24
 
 
25
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
26
  print(f"Device: {device}...")
27
  if device.type == "cuda":
@@ -65,7 +67,9 @@ examples = [
65
  models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
66
  #models_List = ["NCBO/BioPortal" ]
67
 
68
- categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
 
 
69
 
70
  modelGliner=None
71
  modelGlinerBio=None
@@ -366,6 +370,12 @@ def process_row_BioPortal_api(args, key_bioportal, row):
366
  url = f"https://services.data.bioontology.org/annotatorplus/?text={context_to_annotate}&ontologies={onto_clauses}&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}"
367
 
368
  else:
 
 
 
 
 
 
369
  # args.KG_restriction does not exist or is empty
370
  if strtobool(args.debug):
371
  print("--- BIOPORTAL: " + context_to_annotate)
@@ -1148,6 +1158,12 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1148
 
1149
  else:
1150
  # args.KG_restriction does not exist or is empty
 
 
 
 
 
 
1151
  if strtobool(args.debug):
1152
  print("--- " + word.lower())
1153
  print("KG_restriction is not provided or empty - Consider all the KGs in the virtuoso endpoint")
@@ -1228,7 +1244,8 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1228
 
1229
  if not data:
1230
  # nothing found from Bioportal
1231
- return None, None, None, None, None, cache_map_virtuoso
 
1232
 
1233
  dff = pd.DataFrame(data)
1234
  dff = dff.drop(columns=['hierarchy', 'mappings'])
@@ -1276,7 +1293,8 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1276
 
1277
  if df_expanded.empty:
1278
  # nothing found from Bioportal
1279
- return None, None, None, None, None, cache_map_virtuoso
 
1280
 
1281
  # Specify the columns you want to keep
1282
  columns_to_keep = ["score", "from", "to", "prefLabel", "text", "@id"]
@@ -1335,13 +1353,15 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1335
 
1336
  else:
1337
  #nothing found from Bioportal
1338
- return None, None, None, None, None, cache_map_virtuoso
 
1339
 
1340
 
1341
  except Exception as err:
1342
  logging.error(
1343
  f'ERROR ON BioPortal Annotator API Call\n\tError: {err}\n TextToAnnotate: {word.lower()}\n Have a check...')
1344
- return None, None, None, None, None, cache_map_virtuoso
 
1345
 
1346
  except Exception as err:
1347
 
@@ -1350,7 +1370,8 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1350
  # cache_map_virtuoso[word] = {}
1351
  # cache_map_virtuoso[word][contextWordVirtuoso] = None
1352
 
1353
- return None, None, None, None, None, cache_map_virtuoso
 
1354
 
1355
 
1356
  return entityBioeUrl, ALLURIScontext, cache_map_virtuoso
@@ -1566,18 +1587,18 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
1566
  try:
1567
 
1568
  contextText = ""
1569
- if args.service_provider == "gptjrc":
1570
- contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
1571
- temperature=args.temperature, delimiter=myDelimiter,
1572
- InContextExamples=[],
1573
- handler=api_call_gptjrc,
1574
- verbose=True, args=args)
1575
- elif args.service_provider == "HFonPremises":
1576
- contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
1577
- temperature=args.temperature, delimiter=myDelimiter,
1578
- InContextExamples=[],
1579
- handler=api_call_HFonPremises,
1580
- verbose=True, args=args)
1581
 
1582
 
1583
 
@@ -2257,7 +2278,7 @@ def elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map
2257
 
2258
 
2259
 
2260
- def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, history_dict: dict):
2261
 
2262
  if EntityLinking:
2263
  EnableNEL="True"
@@ -2266,7 +2287,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2266
 
2267
  if not text:
2268
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2269
- return {"text": text, "entities": []}, html_output, dict()
 
 
 
 
 
2270
 
2271
  df_annotated = pd.DataFrame()
2272
 
@@ -2326,9 +2352,18 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2326
 
2327
  df_ToAnnotate = pd.DataFrame()
2328
 
2329
- #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
2330
- #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
2331
- if (not history_dict) or (history_dict[args.source_column][0] != text):
 
 
 
 
 
 
 
 
 
2332
 
2333
  for model_id in models_List: # always do all the annotations, only filter them afterwards
2334
  #for model_id in ModelsSelection:
@@ -2377,7 +2412,11 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2377
  # If df_annotated is not empty, concatenate new_annotations to it
2378
  df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
2379
 
2380
- history = df_annotated.copy()
 
 
 
 
2381
 
2382
  else:
2383
 
@@ -2392,9 +2431,16 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2392
  torch.cuda.manual_seed_all(args.SEED)
2393
  ###
2394
 
2395
- history = pd.DataFrame(history_dict)
2396
  df_annotated = history.copy()
2397
 
 
 
 
 
 
 
 
2398
 
2399
  quoted_text = text.startswith('"') & text.endswith('"')
2400
  if (not df_annotated.empty) or quoted_text:
@@ -2404,14 +2450,14 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2404
  df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
2405
  if df_annotated.empty and quoted_text==False:
2406
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2407
- return {"text": text, "entities": []}, html_output, history.to_dict()
2408
 
2409
  df_annotated_combined = pd.DataFrame()
2410
  if (not df_annotated.empty):
2411
  df_annotated_combined = entitiesFusion(df_annotated,args)
2412
  if df_annotated_combined.empty and quoted_text==False:
2413
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2414
- return {"text": text, "entities": []}, html_output, history.to_dict()
2415
  else:
2416
  df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.999 value, to avoid the linking
2417
 
@@ -2566,15 +2612,19 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2566
  [cat.lower() for cat in CategoriesSelection])
2567
  if "MED" in CategoriesSelection:
2568
  filter_mask |= df_annotated_combined['entity_group'].str.lower().isin(
2569
- [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
2570
  if "MISC" in CategoriesSelection:
2571
- #filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
2572
- filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC
 
 
 
 
2573
 
2574
  df_annotated_combined = df_annotated_combined[filter_mask]
2575
  if df_annotated_combined.empty:
2576
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2577
- return {"text": text, "entities": []}, html_output, history.to_dict()
2578
 
2579
  ###
2580
 
@@ -2584,7 +2634,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2584
  df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
2585
  if df_annotated_combined.empty:
2586
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2587
- return {"text": text, "entities": []}, html_output, history.to_dict()
2588
 
2589
  dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
2590
 
@@ -2608,9 +2658,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2608
  # 'word'], axis=1)
2609
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2610
  lambda
2611
- row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>" if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
2612
- 'word'], axis=1)
2613
-
2614
 
2615
  # Create a new dictionary with the entity information and the link
2616
  dict_annotated_combined_NEL = df_annotated_combined[
@@ -2676,16 +2725,16 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2676
  # # f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
2677
 
2678
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
2679
- return {"text": text, "entities": dict_annotated_combined_NER}, html_output, history.to_dict()
2680
 
2681
  else:
2682
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2683
- return {"text": text, "entities": dict_annotated_combined_NER}, html_output, history.to_dict()
2684
 
2685
  else:
2686
 
2687
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2688
- return {"text": text, "entities": []}, html_output, history.to_dict()
2689
 
2690
 
2691
 
@@ -2709,13 +2758,37 @@ demo = gr.Interface(
2709
  live=True,
2710
  title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
2711
  description="""Interoperability – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
2712
- The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
2713
- Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
2714
 
2715
- In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies. See also: https://citnet.tech.ec.europa.eu/CITnet/confluence/display/DIGHEALTH/Inventory+of+existing+KGs+related+to+the+Digital+Health+domain).
2716
- """,
2717
  examples=examples,
2718
- cache_examples=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2719
  )
2720
 
2721
 
 
20
  from concurrent.futures import ThreadPoolExecutor, as_completed
21
  from collections import Counter
22
 
23
+
24
  import torch
25
 
26
+
27
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
28
  print(f"Device: {device}...")
29
  if device.type == "cuda":
 
67
  models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
68
  #models_List = ["NCBO/BioPortal" ]
69
 
70
+ #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
71
+ categories_List = ["MED","LOC","PER","ORG","DATE","MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS" , "PROC"]
72
+
73
 
74
  modelGliner=None
75
  modelGlinerBio=None
 
370
  url = f"https://services.data.bioontology.org/annotatorplus/?text={context_to_annotate}&ontologies={onto_clauses}&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}"
371
 
372
  else:
373
+
374
+ kg_restriction = getattr(args, 'KG_restriction', None)
375
+ if kg_restriction is not None and len(kg_restriction) == 0:
376
+ print("KG_restriction is provided but empty")
377
+ return pd.DataFrame()
378
+
379
  # args.KG_restriction does not exist or is empty
380
  if strtobool(args.debug):
381
  print("--- BIOPORTAL: " + context_to_annotate)
 
1158
 
1159
  else:
1160
  # args.KG_restriction does not exist or is empty
1161
+
1162
+ kg_restriction = getattr(args, 'KG_restriction', None)
1163
+ if kg_restriction is not None and len(kg_restriction) == 0:
1164
+ print("KG_restriction is provided but empty")
1165
+ return None, None, cache_map_virtuoso
1166
+
1167
  if strtobool(args.debug):
1168
  print("--- " + word.lower())
1169
  print("KG_restriction is not provided or empty - Consider all the KGs in the virtuoso endpoint")
 
1244
 
1245
  if not data:
1246
  # nothing found from Bioportal
1247
+ #return None, None, None, None, None, cache_map_virtuoso
1248
+ return None, None, cache_map_virtuoso
1249
 
1250
  dff = pd.DataFrame(data)
1251
  dff = dff.drop(columns=['hierarchy', 'mappings'])
 
1293
 
1294
  if df_expanded.empty:
1295
  # nothing found from Bioportal
1296
+ #return None, None, None, None, None, cache_map_virtuoso
1297
+ return None, None, cache_map_virtuoso
1298
 
1299
  # Specify the columns you want to keep
1300
  columns_to_keep = ["score", "from", "to", "prefLabel", "text", "@id"]
 
1353
 
1354
  else:
1355
  #nothing found from Bioportal
1356
+ #return None, None, None, None, None, cache_map_virtuoso
1357
+ return None, None, cache_map_virtuoso
1358
 
1359
 
1360
  except Exception as err:
1361
  logging.error(
1362
  f'ERROR ON BioPortal Annotator API Call\n\tError: {err}\n TextToAnnotate: {word.lower()}\n Have a check...')
1363
+ #return None, None, None, None, None, cache_map_virtuoso
1364
+ return None, None, cache_map_virtuoso
1365
 
1366
  except Exception as err:
1367
 
 
1370
  # cache_map_virtuoso[word] = {}
1371
  # cache_map_virtuoso[word][contextWordVirtuoso] = None
1372
 
1373
+ #return None, None, None, None, None, cache_map_virtuoso
1374
+ return None, None, cache_map_virtuoso
1375
 
1376
 
1377
  return entityBioeUrl, ALLURIScontext, cache_map_virtuoso
 
1587
  try:
1588
 
1589
  contextText = ""
1590
+ # if args.service_provider == "gptjrc":
1591
+ # contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
1592
+ # temperature=args.temperature, delimiter=myDelimiter,
1593
+ # InContextExamples=[],
1594
+ # handler=api_call_gptjrc,
1595
+ # verbose=True, args=args)
1596
+ # elif args.service_provider == "HFonPremises":
1597
+ # contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
1598
+ # temperature=args.temperature, delimiter=myDelimiter,
1599
+ # InContextExamples=[],
1600
+ # handler=api_call_HFonPremises,
1601
+ # verbose=True, args=args)
1602
 
1603
 
1604
 
 
2278
 
2279
 
2280
 
2281
+ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):
2282
 
2283
  if EntityLinking:
2284
  EnableNEL="True"
 
2287
 
2288
  if not text:
2289
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2290
+ state = {
2291
+ "text": "",
2292
+ "df_annotated_dict": dict(),
2293
+ "KGchoices": KGchoices
2294
+ }
2295
+ return {"text": text, "entities": []}, html_output, state
2296
 
2297
  df_annotated = pd.DataFrame()
2298
 
 
2352
 
2353
  df_ToAnnotate = pd.DataFrame()
2354
 
2355
+ previous_text = ""
2356
+ previous_df_annotated_dict = dict()
2357
+ previous_kg_choices = []
2358
+ if state:
2359
+ previous_text = state.get("text", "")
2360
+ previous_df_annotated_dict = state.get("df_annotated_dict", {})
2361
+ previous_kg_choices = state.get("KGchoices", [])
2362
+
2363
+ # print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
2364
+ # if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
2365
+ # if (not history_dict) or (history_dict[args.source_column][0] != text):
2366
+ if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices)):
2367
 
2368
  for model_id in models_List: # always do all the annotations, only filter them afterwards
2369
  #for model_id in ModelsSelection:
 
2412
  # If df_annotated is not empty, concatenate new_annotations to it
2413
  df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
2414
 
2415
+ state = {
2416
+ "text": text,
2417
+ "df_annotated_dict": df_annotated.to_dict(),
2418
+ "KGchoices": KGchoices
2419
+ }
2420
 
2421
  else:
2422
 
 
2431
  torch.cuda.manual_seed_all(args.SEED)
2432
  ###
2433
 
2434
+ history = pd.DataFrame(previous_df_annotated_dict)
2435
  df_annotated = history.copy()
2436
 
2437
+ state = {
2438
+ "text": text,
2439
+ "df_annotated_dict": df_annotated.to_dict(),
2440
+ "KGchoices": KGchoices
2441
+ }
2442
+
2443
+
2444
 
2445
  quoted_text = text.startswith('"') & text.endswith('"')
2446
  if (not df_annotated.empty) or quoted_text:
 
2450
  df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
2451
  if df_annotated.empty and quoted_text==False:
2452
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2453
+ return {"text": text, "entities": []}, html_output, state
2454
 
2455
  df_annotated_combined = pd.DataFrame()
2456
  if (not df_annotated.empty):
2457
  df_annotated_combined = entitiesFusion(df_annotated,args)
2458
  if df_annotated_combined.empty and quoted_text==False:
2459
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2460
+ return {"text": text, "entities": []}, html_output, state
2461
  else:
2462
  df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.999 value, to avoid the linking
2463
 
 
2612
  [cat.lower() for cat in CategoriesSelection])
2613
  if "MED" in CategoriesSelection:
2614
  filter_mask |= df_annotated_combined['entity_group'].str.lower().isin(
2615
+ [cat.lower() for cat in CategoriesSelection]) & (df_annotated_combined['IsBio'] == 1)
2616
  if "MISC" in CategoriesSelection:
2617
+ # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
2618
+ # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this clause, I'm including not only the categories labelled as MISC, but also the others that are not MED, PER, ORG, LOC
2619
+ filter_mask |= ~(
2620
+ df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(
2621
+ df_annotated_combined[
2622
+ 'IsBio'] == 1) # with this clause, I'm including not only the categories labelled as MISC, but also the others that are not MED, PER, ORG, LOC
2623
 
2624
  df_annotated_combined = df_annotated_combined[filter_mask]
2625
  if df_annotated_combined.empty:
2626
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2627
+ return {"text": text, "entities": []}, html_output, state
2628
 
2629
  ###
2630
 
 
2634
  df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
2635
  if df_annotated_combined.empty:
2636
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2637
+ return {"text": text, "entities": []}, html_output, state
2638
 
2639
  dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
2640
 
 
2658
  # 'word'], axis=1)
2659
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2660
  lambda
2661
+ row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>" if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
2662
+ 'word'], axis=1)
 
2663
 
2664
  # Create a new dictionary with the entity information and the link
2665
  dict_annotated_combined_NEL = df_annotated_combined[
 
2725
  # # f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
2726
 
2727
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
2728
+ return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
2729
 
2730
  else:
2731
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2732
+ return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
2733
 
2734
  else:
2735
 
2736
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
2737
+ return {"text": text, "entities": []}, html_output, state
2738
 
2739
 
2740
 
 
2758
  live=True,
2759
  title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
2760
  description="""Interoperability – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
2761
+ The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
2762
+ Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
2763
 
2764
+ In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
2765
+ """,
2766
  examples=examples,
2767
+ cache_examples=False,
2768
+ article="""
2769
+ **Categories Legend:**
2770
+ - MED | Medical
2771
+ - LOC | Locations
2772
+ - PER | Persons
2773
+ - ORG | Organizations
2774
+ - MISC | Miscellanea
2775
+ - CONC | Concepts & Ideas
2776
+ - BIOP | Biological
2777
+ - ACTI | Activities & Behaviors
2778
+ - ANAT | Anatomy
2779
+ - CHEM | Chemicals & Drugs
2780
+ - DEVI | Devices
2781
+ - DISO | Disorders
2782
+ - GENE | Genes & Molecular Sequences
2783
+ - GEOG | Geographic Areas
2784
+ - LIVB | Living Beings
2785
+ - OBJC | Objects
2786
+ - OCCU | Occupations
2787
+ - ORGA | Organizations
2788
+ - PHEN | Phenomena
2789
+ - PHYS | Physiology
2790
+ - PROC | Procedures
2791
+ """
2792
  )
2793
 
2794