Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -20,8 +20,10 @@ from typing import Dict
|
|
20 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
21 |
from collections import Counter
|
22 |
|
|
|
23 |
import torch
|
24 |
|
|
|
25 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
26 |
print(f"Device: {device}...")
|
27 |
if device.type == "cuda":
|
@@ -65,7 +67,9 @@ examples = [
|
|
65 |
models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
|
66 |
#models_List = ["NCBO/BioPortal" ]
|
67 |
|
68 |
-
categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
|
|
|
|
|
69 |
|
70 |
modelGliner=None
|
71 |
modelGlinerBio=None
|
@@ -366,6 +370,12 @@ def process_row_BioPortal_api(args, key_bioportal, row):
|
|
366 |
url = f"https://services.data.bioontology.org/annotatorplus/?text={context_to_annotate}&ontologies={onto_clauses}&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}"
|
367 |
|
368 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
# args.KG_restriction does not exist or is empty
|
370 |
if strtobool(args.debug):
|
371 |
print("--- BIOPORTAL: " + context_to_annotate)
|
@@ -1148,6 +1158,12 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
1148 |
|
1149 |
else:
|
1150 |
# args.KG_restriction does not exist or is empty
|
|
|
|
|
|
|
|
|
|
|
|
|
1151 |
if strtobool(args.debug):
|
1152 |
print("--- " + word.lower())
|
1153 |
print("KG_restriction is not provided or empty - Consider all the KGs in the virtuoso endpoint")
|
@@ -1228,7 +1244,8 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
1228 |
|
1229 |
if not data:
|
1230 |
# nothing found from Bioportal
|
1231 |
-
return None, None, None, None, None, cache_map_virtuoso
|
|
|
1232 |
|
1233 |
dff = pd.DataFrame(data)
|
1234 |
dff = dff.drop(columns=['hierarchy', 'mappings'])
|
@@ -1276,7 +1293,8 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
1276 |
|
1277 |
if df_expanded.empty:
|
1278 |
# nothing found from Bioportal
|
1279 |
-
return None, None, None, None, None, cache_map_virtuoso
|
|
|
1280 |
|
1281 |
# Specify the columns you want to keep
|
1282 |
columns_to_keep = ["score", "from", "to", "prefLabel", "text", "@id"]
|
@@ -1335,13 +1353,15 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
1335 |
|
1336 |
else:
|
1337 |
#nothing found from Bioportal
|
1338 |
-
return None, None, None, None, None, cache_map_virtuoso
|
|
|
1339 |
|
1340 |
|
1341 |
except Exception as err:
|
1342 |
logging.error(
|
1343 |
f'ERROR ON BioPortal Annotator API Call\n\tError: {err}\n TextToAnnotate: {word.lower()}\n Have a check...')
|
1344 |
-
return None, None, None, None, None, cache_map_virtuoso
|
|
|
1345 |
|
1346 |
except Exception as err:
|
1347 |
|
@@ -1350,7 +1370,8 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
1350 |
# cache_map_virtuoso[word] = {}
|
1351 |
# cache_map_virtuoso[word][contextWordVirtuoso] = None
|
1352 |
|
1353 |
-
return None, None, None, None, None, cache_map_virtuoso
|
|
|
1354 |
|
1355 |
|
1356 |
return entityBioeUrl, ALLURIScontext, cache_map_virtuoso
|
@@ -1566,18 +1587,18 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
|
|
1566 |
try:
|
1567 |
|
1568 |
contextText = ""
|
1569 |
-
if args.service_provider == "gptjrc":
|
1570 |
-
|
1571 |
-
|
1572 |
-
|
1573 |
-
|
1574 |
-
|
1575 |
-
elif args.service_provider == "HFonPremises":
|
1576 |
-
|
1577 |
-
|
1578 |
-
|
1579 |
-
|
1580 |
-
|
1581 |
|
1582 |
|
1583 |
|
@@ -2257,7 +2278,7 @@ def elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map
|
|
2257 |
|
2258 |
|
2259 |
|
2260 |
-
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices,
|
2261 |
|
2262 |
if EntityLinking:
|
2263 |
EnableNEL="True"
|
@@ -2266,7 +2287,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2266 |
|
2267 |
if not text:
|
2268 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2269 |
-
|
|
|
|
|
|
|
|
|
|
|
2270 |
|
2271 |
df_annotated = pd.DataFrame()
|
2272 |
|
@@ -2326,9 +2352,18 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2326 |
|
2327 |
df_ToAnnotate = pd.DataFrame()
|
2328 |
|
2329 |
-
|
2330 |
-
|
2331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2332 |
|
2333 |
for model_id in models_List: # always do all the annotations, only filter them afterwards
|
2334 |
#for model_id in ModelsSelection:
|
@@ -2377,7 +2412,11 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2377 |
# If df_annotated is not empty, concatenate new_annotations to it
|
2378 |
df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
|
2379 |
|
2380 |
-
|
|
|
|
|
|
|
|
|
2381 |
|
2382 |
else:
|
2383 |
|
@@ -2392,9 +2431,16 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2392 |
torch.cuda.manual_seed_all(args.SEED)
|
2393 |
###
|
2394 |
|
2395 |
-
history = pd.DataFrame(
|
2396 |
df_annotated = history.copy()
|
2397 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2398 |
|
2399 |
quoted_text = text.startswith('"') & text.endswith('"')
|
2400 |
if (not df_annotated.empty) or quoted_text:
|
@@ -2404,14 +2450,14 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2404 |
df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
|
2405 |
if df_annotated.empty and quoted_text==False:
|
2406 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2407 |
-
return {"text": text, "entities": []}, html_output,
|
2408 |
|
2409 |
df_annotated_combined = pd.DataFrame()
|
2410 |
if (not df_annotated.empty):
|
2411 |
df_annotated_combined = entitiesFusion(df_annotated,args)
|
2412 |
if df_annotated_combined.empty and quoted_text==False:
|
2413 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2414 |
-
return {"text": text, "entities": []}, html_output,
|
2415 |
else:
|
2416 |
df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.99. to avoid the linking
|
2417 |
|
@@ -2566,15 +2612,19 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2566 |
[cat.lower() for cat in CategoriesSelection])
|
2567 |
if "MED" in CategoriesSelection:
|
2568 |
filter_mask |= df_annotated_combined['entity_group'].str.lower().isin(
|
2569 |
-
[cat.lower() for cat in CategoriesSelection])
|
2570 |
if "MISC" in CategoriesSelection:
|
2571 |
-
#filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
|
2572 |
-
filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC
|
|
|
|
|
|
|
|
|
2573 |
|
2574 |
df_annotated_combined = df_annotated_combined[filter_mask]
|
2575 |
if df_annotated_combined.empty:
|
2576 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2577 |
-
return {"text": text, "entities": []}, html_output,
|
2578 |
|
2579 |
###
|
2580 |
|
@@ -2584,7 +2634,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2584 |
df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
|
2585 |
if df_annotated_combined.empty:
|
2586 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2587 |
-
return {"text": text, "entities": []}, html_output,
|
2588 |
|
2589 |
dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
|
2590 |
|
@@ -2608,9 +2658,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2608 |
# 'word'], axis=1)
|
2609 |
df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
|
2610 |
lambda
|
2611 |
-
row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
|
2612 |
-
'word'], axis=1)
|
2613 |
-
|
2614 |
|
2615 |
# Create a new dictionary with the entity information and the link
|
2616 |
dict_annotated_combined_NEL = df_annotated_combined[
|
@@ -2676,16 +2725,16 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2676 |
# # f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
|
2677 |
|
2678 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
|
2679 |
-
return {"text": text, "entities": dict_annotated_combined_NER}, html_output,
|
2680 |
|
2681 |
else:
|
2682 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2683 |
-
return {"text": text, "entities": dict_annotated_combined_NER}, html_output,
|
2684 |
|
2685 |
else:
|
2686 |
|
2687 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2688 |
-
return {"text": text, "entities": []}, html_output,
|
2689 |
|
2690 |
|
2691 |
|
@@ -2709,13 +2758,37 @@ demo = gr.Interface(
|
|
2709 |
live=True,
|
2710 |
title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
|
2711 |
description="""Interoperability – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
|
2712 |
-
|
2713 |
-
|
2714 |
|
2715 |
-
|
2716 |
-
|
2717 |
examples=examples,
|
2718 |
-
cache_examples=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2719 |
)
|
2720 |
|
2721 |
|
|
|
20 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
21 |
from collections import Counter
|
22 |
|
23 |
+
|
24 |
import torch
|
25 |
|
26 |
+
|
27 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
28 |
print(f"Device: {device}...")
|
29 |
if device.type == "cuda":
|
|
|
67 |
models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
|
68 |
#models_List = ["NCBO/BioPortal" ]
|
69 |
|
70 |
+
#categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
|
71 |
+
categories_List = ["MED","LOC","PER","ORG","DATE","MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS" , "PROC"]
|
72 |
+
|
73 |
|
74 |
modelGliner=None
|
75 |
modelGlinerBio=None
|
|
|
370 |
url = f"https://services.data.bioontology.org/annotatorplus/?text={context_to_annotate}&ontologies={onto_clauses}&longest_only=true&exclude_numbers=true&whole_word_only=true&exclude_synonyms=false&negation=false&experiencer=false&temporality=false&score_threshold=0&confidence_threshold=0&display_links=false&display_context=false&score=cvalue&apikey={key_bioportal}"
|
371 |
|
372 |
else:
|
373 |
+
|
374 |
+
kg_restriction = getattr(args, 'KG_restriction', None)
|
375 |
+
if kg_restriction is not None and len(kg_restriction) == 0:
|
376 |
+
print("KG_restriction is provided but empty")
|
377 |
+
return pd.DataFrame()
|
378 |
+
|
379 |
# args.KG_restriction does not exist or is empty
|
380 |
if strtobool(args.debug):
|
381 |
print("--- BIOPORTAL: " + context_to_annotate)
|
|
|
1158 |
|
1159 |
else:
|
1160 |
# args.KG_restriction does not exist or is empty
|
1161 |
+
|
1162 |
+
kg_restriction = getattr(args, 'KG_restriction', None)
|
1163 |
+
if kg_restriction is not None and len(kg_restriction) == 0:
|
1164 |
+
print("KG_restriction is provided but empty")
|
1165 |
+
return None, None, cache_map_virtuoso
|
1166 |
+
|
1167 |
if strtobool(args.debug):
|
1168 |
print("--- " + word.lower())
|
1169 |
print("KG_restriction is not provided or empty - Consider all the KGs in the virtuoso endpoint")
|
|
|
1244 |
|
1245 |
if not data:
|
1246 |
# nothing found from Bioportal
|
1247 |
+
#return None, None, None, None, None, cache_map_virtuoso
|
1248 |
+
return None, None, cache_map_virtuoso
|
1249 |
|
1250 |
dff = pd.DataFrame(data)
|
1251 |
dff = dff.drop(columns=['hierarchy', 'mappings'])
|
|
|
1293 |
|
1294 |
if df_expanded.empty:
|
1295 |
# nothing found from Bioportal
|
1296 |
+
#return None, None, None, None, None, cache_map_virtuoso
|
1297 |
+
return None, None, cache_map_virtuoso
|
1298 |
|
1299 |
# Specify the columns you want to keep
|
1300 |
columns_to_keep = ["score", "from", "to", "prefLabel", "text", "@id"]
|
|
|
1353 |
|
1354 |
else:
|
1355 |
#nothing found from Bioportal
|
1356 |
+
#return None, None, None, None, None, cache_map_virtuoso
|
1357 |
+
return None, None, cache_map_virtuoso
|
1358 |
|
1359 |
|
1360 |
except Exception as err:
|
1361 |
logging.error(
|
1362 |
f'ERROR ON BioPortal Annotator API Call\n\tError: {err}\n TextToAnnotate: {word.lower()}\n Have a check...')
|
1363 |
+
#return None, None, None, None, None, cache_map_virtuoso
|
1364 |
+
return None, None, cache_map_virtuoso
|
1365 |
|
1366 |
except Exception as err:
|
1367 |
|
|
|
1370 |
# cache_map_virtuoso[word] = {}
|
1371 |
# cache_map_virtuoso[word][contextWordVirtuoso] = None
|
1372 |
|
1373 |
+
#return None, None, None, None, None, cache_map_virtuoso
|
1374 |
+
return None, None, cache_map_virtuoso
|
1375 |
|
1376 |
|
1377 |
return entityBioeUrl, ALLURIScontext, cache_map_virtuoso
|
|
|
1587 |
try:
|
1588 |
|
1589 |
contextText = ""
|
1590 |
+
# if args.service_provider == "gptjrc":
|
1591 |
+
# contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
|
1592 |
+
# temperature=args.temperature, delimiter=myDelimiter,
|
1593 |
+
# InContextExamples=[],
|
1594 |
+
# handler=api_call_gptjrc,
|
1595 |
+
# verbose=True, args=args)
|
1596 |
+
# elif args.service_provider == "HFonPremises":
|
1597 |
+
# contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
|
1598 |
+
# temperature=args.temperature, delimiter=myDelimiter,
|
1599 |
+
# InContextExamples=[],
|
1600 |
+
# handler=api_call_HFonPremises,
|
1601 |
+
# verbose=True, args=args)
|
1602 |
|
1603 |
|
1604 |
|
|
|
2278 |
|
2279 |
|
2280 |
|
2281 |
+
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):
|
2282 |
|
2283 |
if EntityLinking:
|
2284 |
EnableNEL="True"
|
|
|
2287 |
|
2288 |
if not text:
|
2289 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2290 |
+
state = {
|
2291 |
+
"text": "",
|
2292 |
+
"df_annotated_dict": dict(),
|
2293 |
+
"KGchoices": KGchoices
|
2294 |
+
}
|
2295 |
+
return {"text": text, "entities": []}, html_output, state
|
2296 |
|
2297 |
df_annotated = pd.DataFrame()
|
2298 |
|
|
|
2352 |
|
2353 |
df_ToAnnotate = pd.DataFrame()
|
2354 |
|
2355 |
+
previous_text = ""
|
2356 |
+
previous_df_annotated_dict = dict()
|
2357 |
+
previous_kg_choices = []
|
2358 |
+
if state:
|
2359 |
+
previous_text = state.get("text", "")
|
2360 |
+
previous_df_annotated_dict = state.get("df_annotated_dict", {})
|
2361 |
+
previous_kg_choices = state.get("KGchoices", [])
|
2362 |
+
|
2363 |
+
# print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
|
2364 |
+
# if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
|
2365 |
+
# if (not history_dict) or (history_dict[args.source_column][0] != text):
|
2366 |
+
if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices)):
|
2367 |
|
2368 |
for model_id in models_List: # always do all the annotations, only filter them afterwards
|
2369 |
#for model_id in ModelsSelection:
|
|
|
2412 |
# If df_annotated is not empty, concatenate new_annotations to it
|
2413 |
df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
|
2414 |
|
2415 |
+
state = {
|
2416 |
+
"text": text,
|
2417 |
+
"df_annotated_dict": df_annotated.to_dict(),
|
2418 |
+
"KGchoices": KGchoices
|
2419 |
+
}
|
2420 |
|
2421 |
else:
|
2422 |
|
|
|
2431 |
torch.cuda.manual_seed_all(args.SEED)
|
2432 |
###
|
2433 |
|
2434 |
+
history = pd.DataFrame(previous_df_annotated_dict)
|
2435 |
df_annotated = history.copy()
|
2436 |
|
2437 |
+
state = {
|
2438 |
+
"text": text,
|
2439 |
+
"df_annotated_dict": df_annotated.to_dict(),
|
2440 |
+
"KGchoices": KGchoices
|
2441 |
+
}
|
2442 |
+
|
2443 |
+
|
2444 |
|
2445 |
quoted_text = text.startswith('"') & text.endswith('"')
|
2446 |
if (not df_annotated.empty) or quoted_text:
|
|
|
2450 |
df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
|
2451 |
if df_annotated.empty and quoted_text==False:
|
2452 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2453 |
+
return {"text": text, "entities": []}, html_output, state
|
2454 |
|
2455 |
df_annotated_combined = pd.DataFrame()
|
2456 |
if (not df_annotated.empty):
|
2457 |
df_annotated_combined = entitiesFusion(df_annotated,args)
|
2458 |
if df_annotated_combined.empty and quoted_text==False:
|
2459 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2460 |
+
return {"text": text, "entities": []}, html_output, state
|
2461 |
else:
|
2462 |
df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.999, to avoid the linking
|
2463 |
|
|
|
2612 |
[cat.lower() for cat in CategoriesSelection])
|
2613 |
if "MED" in CategoriesSelection:
|
2614 |
filter_mask |= df_annotated_combined['entity_group'].str.lower().isin(
|
2615 |
+
[cat.lower() for cat in CategoriesSelection]) & (df_annotated_combined['IsBio'] == 1)
|
2616 |
if "MISC" in CategoriesSelection:
|
2617 |
+
# filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
|
2618 |
+
# filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC
|
2619 |
+
filter_mask |= ~(
|
2620 |
+
df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(
|
2621 |
+
df_annotated_combined[
|
2622 |
+
'IsBio'] == 1) # with this clause, I'm including not only the categories labelled as MISC, but also the non-bio entity groups that are not listed in categories_List (e.g. not MED, PER, ORG, LOC)
|
2623 |
|
2624 |
df_annotated_combined = df_annotated_combined[filter_mask]
|
2625 |
if df_annotated_combined.empty:
|
2626 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2627 |
+
return {"text": text, "entities": []}, html_output, state
|
2628 |
|
2629 |
###
|
2630 |
|
|
|
2634 |
df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
|
2635 |
if df_annotated_combined.empty:
|
2636 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2637 |
+
return {"text": text, "entities": []}, html_output, state
|
2638 |
|
2639 |
dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
|
2640 |
|
|
|
2658 |
# 'word'], axis=1)
|
2659 |
df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
|
2660 |
lambda
|
2661 |
+
row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>" if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
|
2662 |
+
'word'], axis=1)
|
|
|
2663 |
|
2664 |
# Create a new dictionary with the entity information and the link
|
2665 |
dict_annotated_combined_NEL = df_annotated_combined[
|
|
|
2725 |
# # f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
|
2726 |
|
2727 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
|
2728 |
+
return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
|
2729 |
|
2730 |
else:
|
2731 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2732 |
+
return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
|
2733 |
|
2734 |
else:
|
2735 |
|
2736 |
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
2737 |
+
return {"text": text, "entities": []}, html_output, state
|
2738 |
|
2739 |
|
2740 |
|
|
|
2758 |
live=True,
|
2759 |
title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
|
2760 |
description="""Interoperability – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
|
2761 |
+
The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
|
2762 |
+
Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
|
2763 |
|
2764 |
+
In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
|
2765 |
+
""",
|
2766 |
examples=examples,
|
2767 |
+
cache_examples=False,
|
2768 |
+
article="""
|
2769 |
+
**Categories Legend:**
|
2770 |
+
- MED | Medical
|
2771 |
+
- LOC | Locations
|
2772 |
+
- PER | Persons
|
2773 |
+
- ORG | Organizations
|
2774 |
+
- MISC | Miscellanea
|
2775 |
+
- CONC | Concepts & Ideas
|
2776 |
+
- BIOP | Biological
|
2777 |
+
- ACTI | Activities & Behaviors
|
2778 |
+
- ANAT | Anatomy
|
2779 |
+
- CHEM | Chemicals & Drugs
|
2780 |
+
- DEVI | Devices
|
2781 |
+
- DISO | Disorders
|
2782 |
+
- GENE | Genes & Molecular Sequences
|
2783 |
+
- GEOG | Geographic Areas
|
2784 |
+
- LIVB | Living Beings
|
2785 |
+
- OBJC | Objects
|
2786 |
+
- OCCU | Occupations
|
2787 |
+
- ORGA | Organizations
|
2788 |
+
- PHEN | Phenomena
|
2789 |
+
- PHYS | Physiology
|
2790 |
+
- PROC | Procedures
|
2791 |
+
"""
|
2792 |
)
|
2793 |
|
2794 |
|