import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
#os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
from transformers import file_utils
print(file_utils.default_cache_path)
import pandas as pd
from tqdm import tqdm
from gliner import GLiNER
import logging
from jinja2 import Template
from collections import Counter
from transformers import pipeline, AutoTokenizer
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
#import html
import torch
torch.cuda.empty_cache()  # clear torch's CUDA cache
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}...")
if device.type == "cuda":
print("GPU number:", torch.cuda.current_device())
import datasets
import argparse
import json
import random
import numpy as np
import tiktoken
from langchain.text_splitter import TokenTextSplitter
import gradio as gr
import re
from common import strtobool, token_counter, encoding_getter, strip_quotes
from nerBio import annotate, entitiesFusion, is_cross_inside, elinking
from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc
from joblib import Memory
cachedir = 'cached'
mem = Memory(cachedir, verbose=False)
# this is to completely delete the cache:
# mem.clear(warn=False)
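# joblib's Memory provides on-disk memoisation of expensive function calls under `cachedir`.
# Illustrative sketch (not active here; note the commented-out `###@mem.cache` decorator
# further below, just above nerBio):
#
#     @mem.cache
#     def slow_step(text):          # hypothetical function name, for illustration only
#         ...                       # any deterministic, expensive computation
#
# Repeated calls with identical arguments would then be served from the cache directory.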
examples = [
["He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. " , None],
["He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. ", None],
["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. \"So far, the chikungunya disease is still under control nationwide,\" he told reporters after visiting Sultanah Nur Zahirah Hospital here. Present was Terengganu Health Director Dr. Nordiyanah Hassan. Liow said that so far, there is no specific medicine to treat the chikungunya fever disease spread by _Aedes_ mosquito. \"So, I would like to call on the public to be careful particularly during the wet season now because _Aedes_ mosquito is easy to breed,\" he said. To contain the spread of the disease, he said, the ministry had taken several measures including intensifying the campaign to rid of _Aedes_ mosquito and holding lectures on the outbreak. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. Meanwhile, he said 63 health projects costing RM458 million [USD 131 230 211] had been approved for implementation in Terengganu under the Ninth Malaysia Plan and some had started.", None],
["Carcinoma", None],
["The doctor diagnosed the patient with basal cell carcinoma, a common type of skin cancer.", None],
["West Nile virus", None],
["Legionellosis", None],
["Eight years ago I started with Fosamax for 3-4 years and then took Actonel. In March, I decided not to take Actonel any longer. I had been on it for too long and was fearful of esophageal cancer and bone breakage. Now my doctor wants me to take the Prolia injections, which I am not going to do. I am not going to continue with any drugs. My bone density recently done was in the minuses. I do work with a personal trainer and execise daily. I am searching for alternative ways to deal with this problem.", None],
["Does Chicago have any stores and does Joe live here?", None],
["Cholera has been reported every week since November 1994. By 5 November 1995 at total of 12,344 with 245 deaths have been notified. Of these, 879 cases with 4 deaths were reported for the period 9 October to 5 November 1995. Control efforts have not succeeded in preventing the spread of the epidemic and when cases were detected on Sao Nicolau and Sal Islands in the period 9 October to 5 November all nine inhabited islands of Cap Verde had become infected. The last cholera epidemic in Cap Verde occurred in 1979. (See also Weekly Epidemiological Record No. 44, 3 November 1995) CΓ΄te d'Ivoire: A cholera outbreak which started in September 1995 caused 2,027 cases and 150 deaths up to 12 November 1995. The first cases were reported in Department de l'Ouest on 18 September 1995. Cases were subsequently reported in Department de Nord and most recently in Department du Centre and Department de Sud. The WHO Representative assisted in the organization of a team to visit the area and evaluate the situation as well as arranging for medical supplies. (1.12.95) Iran, Islamic Republic of,: Kordestan Province has been declared free of cholera. (1.12.95) Iraq: An outbreak of cholera reported from Sulaimaniyah Governorate in Northern Iraq has resulted in 519 cases, 264 of which have been confirmed, and 3 deaths to date. Vibrio cholerae O1 serotype Ogawa has been isolated. At the request of the Iraqi Ministry of Health, a WHO consultant has been sent to the area to assess and monitor the situation, provide guidance to the health authorities, and coordinate inputs by non-governmental organizations. WHO has also made available essential treatment supplies. An intensive media campaign to raise public awareness about essential preventive measures has been successful in containing the spread of the outbreak. (1.12.95) Senegal: Despite the fact that cholera has been endemic in countries bordering Senegal for the past two years, no cases were reported from Senegal until mid- August 1995. Between 15 August and 17 November 1995, 852 case and 43 deaths were notified. A further 731 cases with 37 deaths have been reported for the period 1 September to 12 November. Most cases were in the Departments of Dakar and Pikine in the Dakar Region and recently also Departments of Mbacke and Touba in Diourbel Region. ", None],
]
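# Each example above is a [sample text, None] pair: the text is loaded into the input textbox
# when the example is clicked in the demo; the second element is just a placeholder.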
models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
#models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
#models_List = ["NCBO/BioPortal" ]
#categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
categories_List = ["MED","LOC","PER","ORG","DATE","MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS" , "PROC"]
POSSIBLE_KGchoices_List = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
"GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
"MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
"OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
"SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]
modelGliner=None
modelGlinerBio=None
num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner !
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
encod = encoding_getter('microsoft/deberta-v3-large')
text_splitter = TokenTextSplitter(
# separators=separators,
encoding_name=encod.name,
chunk_size=80000,
chunk_overlap=50,
length_function=len,
add_start_index=True,
)
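# The TokenTextSplitter above chunks long inputs by token count (80000-token chunks with a
# 50-token overlap) so that documents larger than a model's context window can be processed
# piecewise. Illustrative usage (not executed here):
#
#     chunks = text_splitter.split_text(long_document)   # `long_document` is hypothetical
#     for chunk in chunks:
#         ...  # annotate / link each chunk separately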
pipe_dict = {}
for modelName in models_List:
tsk = "token-classification"
if (("/gliner" in modelName) == False) and (("NCBO" in modelName) == False):
pipe = pipeline(
tsk,
model=modelName,
aggregation_strategy="simple",
device=device,
)
pipe_dict[modelName] = pipe
elif ("/gliner" in modelName):
if not tokenizerGliner:
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
if "_bio-" in modelName:
if num_cores_Gliner_forDemo > 0:
modelGlinerBio = GLiNER.from_pretrained(modelName) # "urchade/gliner_large_bio-v0.1")
else:
modelGlinerBio = GLiNER.from_pretrained(modelName, map_location=device)
else:
if num_cores_Gliner_forDemo > 0:
modelGliner = GLiNER.from_pretrained(
modelName) # "knowledgator/gliner-multitask-large-v0.5" - "urchade/gliner_large-v2.1"
else:
modelGliner = GLiNER.from_pretrained(modelName, map_location=device)
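# After this loop: pipe_dict maps every standard Hugging Face model name to a ready
# token-classification pipeline, GLiNER checkpoints are kept separately in modelGliner /
# modelGlinerBio, and NCBO/BioPortal has no local pipeline (it is presumably queried via its
# web API downstream, using the BioPortal key handled in nerBio's arguments).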
#### GPT@JRC API
#if args.service_provider == "gptjrc":
key_gptjrc = ""
fkeyname = "GPTJRC-APItoken.key"
if os.path.exists(fkeyname):
with open(fkeyname) as f:
key_gptjrc = f.read()
else:
key_gptjrc = os.environ['key_gptjrc']
if key_gptjrc and key_gptjrc != "":
setup_gptjrc(key_gptjrc)
#####
# Add this function to handle dropdown selection
def get_urls(word, df_annotated_combined):
# Filter the DataFrame to get rows where 'ALLURIScontext' is not empty or None
#valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
    valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
        lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]
# Check if the word is in the filtered DataFrame
if word in valid_entries['word'].values:
urls = valid_entries.loc[valid_entries['word'] == word, 'ALLURIScontext'].values[0]
if 'namedEntity' in df_annotated_combined.columns:
firsturlinlist = df_annotated_combined.loc[df_annotated_combined['word'] == word, 'namedEntity']
firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
if firsturlinlist and firsturlinlist in urls:
# Remove the URL from its current position
urls.remove(firsturlinlist)
# Insert the URL at the first position
urls.insert(0, firsturlinlist)
#html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
return html_links
return ""
###@mem.cache
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):
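    # Main Gradio callback: runs NER with the selected models on `text`, optionally performs
    # entity linking against the chosen knowledge graphs, and returns five outputs:
    # (dict for HighlightedText, HTML with links, updated state, dropdown update, details string).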
if EntityLinking:
EnableNEL="True"
else:
EnableNEL="False"
if not text:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state = {
"text": "",
"df_annotated_dict": dict(),
"df_annotated_combined_dict": dict(),
"KGchoices": KGchoices,
"ModelsSelection": ModelsSelection,
"ScoreFilt": ScoreFilt,
"EntityLinking": EntityLinking,
"html_output": html_output
}
return {"text": text, "entities": []}, html_output, state, [], ""
df_annotated = pd.DataFrame()
parser = argparse.ArgumentParser()
parser.add_argument("--model_id", type=str, default=models_List[0], help="model to use")
parser.add_argument("--debug", type=str, default="True", help="set debug mode")
parser.add_argument("--source_column", type=str, default="ContextToAnnotate")
parser.add_argument("--entities_filter_threshold", type=int, default=ScoreFilt)
parser.add_argument("--SEED", type=int, default=41)
parser.add_argument("--batch_size", type=int, default=32) # 4 - 8 - 16
parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation") # 0 means use the GPU for Gliner !
parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file location where it is stored the geonames api key")
parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file location where it is stored the virtuoso endpoint dba pwd")
parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file location where it is stored the NCBO BioPortal api key")
# consose 20250205:
# KGchoices = None
# KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'NCIT']
# KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT'] # restricts the input to these values only
if KGchoices:
KGchoices.sort()
parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
help="List of ontologies to which restrict the entity linking task.")
#consose 20250502:
if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
parser.add_argument("--USE_CACHE", type=str, default="True",
help="whether to use cache for the NER and NEL tasks or not")
else:
#print("Lists do not have the same elements")
parser.add_argument("--USE_CACHE", type=str, default="False",
help="whether to use cache for the NER and NEL tasks or not")
parser.add_argument("--num_cores_eLinking", type=int, default=10, help="parallel processing for the entity linking process")
parser.add_argument("--computeEntityContext", type=str, default="False",
help="whether to extract a readable context from the extracted triples for the concept")
parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
parser.add_argument("--maxTriplesGlobalContext", type=int, default=20000,
help="maximum number of triples to consider for global context computation") # if 0 or None it is not considered
parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="llm to use")
parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")
parser.add_argument("--temperature", type=int, default=0.01)
args = parser.parse_args()
df_ToAnnotate = pd.DataFrame()
previous_text = ""
previous_df_annotated_dict = dict()
previous_kg_choices = []
if state:
previous_text = state.get("text", "")
previous_df_annotated_dict = state.get("df_annotated_dict", {})
previous_df_annotated_combined_dict = state.get("df_annotated_combined_dict", {})
previous_kg_choices = state.get("KGchoices", [])
previous_ModelsSelection = state.get("ModelsSelection", [])
previous_ScoreFilt_from_state = float(state.get("ScoreFilt", ScoreFilt)) # Ensure ScoreFilt is a float
previous_EntityLinking_from_state = bool(state.get("EntityLinking", EntityLinking)) # Ensure EntityLinking is a boolean
previous_html_output = state.get("html_output", "")
if previous_html_output and (previous_df_annotated_dict) and (previous_df_annotated_combined_dict) and (previous_text == text) and (sorted(previous_kg_choices) == sorted(KGchoices)) and (sorted(previous_ModelsSelection) == sorted(ModelsSelection)) and (previous_ScoreFilt_from_state == ScoreFilt) and (previous_EntityLinking_from_state == EntityLinking):
ddf_annot_prev = pd.DataFrame(previous_df_annotated_combined_dict)
if 'ALLURIScontext' in ddf_annot_prev.columns:
# words_for_dropdown = df_annotated_combined[
# df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
# 'word'].unique().tolist()
                words_for_dropdown = ddf_annot_prev[ddf_annot_prev['ALLURIScontext'].apply(
                    lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))][
                    'word'].unique().tolist()
words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
words_for_dropdown.insert(0, "")
else:
words_for_dropdown = []
dict_annotated_combined_NER = ddf_annot_prev[
["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
# return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
return {"text": text, "entities": dict_annotated_combined_NER}, previous_html_output, state, gr.update(
choices=words_for_dropdown), ""
#print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
#if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
#if (not history_dict) or (history_dict[args.source_column][0] != text):
if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices) ):
for model_id in models_List: # always do all the annotations, only filter them afterwards
#for model_id in ModelsSelection:
# if history_dict and (history_dict[args.source_column][0] == text):
# if model_id in hhist['model'].unique():
# continue
parser.set_defaults(model_id=model_id)
args = parser.parse_args()
print("ARGS:")
print(args)
            # %% In machine learning tasks that involve stochasticity (e.g. text generation), set seeds for Python's random module, NumPy and PyTorch so that results are reproducible across runs.
# Before you create the pipeline and run the text generation, set the seeds like this:
random.seed(args.SEED)
np.random.seed(args.SEED)
torch.manual_seed(args.SEED)
torch.cuda.manual_seed_all(args.SEED)
###
df_ToAnnotate = pd.DataFrame({ "ToLink": [None], args.source_column: [text]})
if "SentenceRef" not in df_ToAnnotate.columns:
df_ToAnnotate["SentenceRef"] = None
df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
col != 'SentenceRef']] # this moves it to the first position
df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int)
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
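            # The three assignments above give every distinct input sentence a dense 1-based
            # SentenceRef: identical texts share the minimum positional index of their group,
            # and rank(method='dense') compresses those indices into consecutive integers.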
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# if strtobool(args.debug):
# print(f"Device: {device}...")
# if device.type == "cuda":
# print("GPU number:", torch.cuda.current_device())
pipeToUse = None
if (("gliner" in args.model_id) == False) and (("NCBO" in args.model_id)== False) :
pipeToUse = pipe_dict[args.model_id]
new_annotations = annotate(df_ToAnnotate, args, pipeToUse, tokenizerGliner, modelGliner, modelGlinerBio, device)
if not new_annotations.empty:
if df_annotated.empty:
# If df_annotated is empty, just assign new_annotations to it
df_annotated = new_annotations
else:
# If df_annotated is not empty, concatenate new_annotations to it
df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
state = {
"text": text,
"df_annotated_dict": df_annotated.to_dict(),
"df_annotated_combined_dict": dict(),
"KGchoices": KGchoices,
"ModelsSelection": ModelsSelection,
"ScoreFilt": ScoreFilt,
"EntityLinking": EntityLinking,
"html_output": ""
}
else:
print("ARGS:")
print(args)
        # %% In machine learning tasks that involve stochasticity (e.g. text generation), set seeds for Python's random module, NumPy and PyTorch so that results are reproducible across runs.
# Before you create the pipeline and run the text generation, set the seeds like this:
random.seed(args.SEED)
np.random.seed(args.SEED)
torch.manual_seed(args.SEED)
torch.cuda.manual_seed_all(args.SEED)
###
history = pd.DataFrame(previous_df_annotated_dict)
df_annotated = history.copy()
state = {
"text": text,
"df_annotated_dict": df_annotated.to_dict(),
"df_annotated_combined_dict": dict(),
"KGchoices": KGchoices,
"ModelsSelection": ModelsSelection,
"ScoreFilt": ScoreFilt,
"EntityLinking": EntityLinking,
"html_output": ""
}
    quoted_text = text.startswith('"') and text.endswith('"')
if (not df_annotated.empty) or quoted_text:
if (not df_annotated.empty):
# filter now per models selection
df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
if df_annotated.empty and quoted_text==False:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
df_annotated_combined = pd.DataFrame()
if (not df_annotated.empty):
df_annotated_combined = entitiesFusion(df_annotated,args)
if df_annotated_combined.empty and quoted_text==False:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
else:
if (not df_annotated.empty):
                df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999)  # drop crossing/nested spans (threshold 0.999) so they are not passed on to the linking step
cache_prefix_fp = "LLMQUERYNER"
cache_nameLLMs = cache_prefix_fp + "___" + "__".join(
[args.service_provider, args.model_name, str(args.temperature)]).replace(
" ", "_") + ".json"
load_map_query_input_output = None
if strtobool(args.USE_CACHE):
if os.path.exists(cache_nameLLMs):
with open(cache_nameLLMs) as f:
load_map_query_input_output = json.load(f)
else:
load_map_query_input_output = {}
### entity linking part:
if strtobool(args.entity_linking):
cache_map_geonames = None
if strtobool(args.USE_CACHE):
cache_filename = "CACHE_geonames.json"
if os.path.exists(cache_filename):
with open(cache_filename) as f:
cache_map_geonames = json.load(f)
else:
cache_map_geonames = {}
key_geonames = ""
if args.geonameskey_filename and os.path.exists(args.geonameskey_filename):
fkeyname = args.geonameskey_filename
with open(fkeyname) as f:
key_geonames = f.read()
else:
key_geonames = os.environ['key_geonames']
cache_map_virtuoso = None
if strtobool(args.USE_CACHE):
cacheVirtuoso_filename = "CACHE_virtuoso.json"
if os.path.exists(cacheVirtuoso_filename):
with open(cacheVirtuoso_filename) as f:
cache_map_virtuoso = json.load(f)
else:
cache_map_virtuoso = {}
key_virtuoso = ""
if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename):
fkeyname = args.virtuosokey_filename
with open(fkeyname) as f:
key_virtuoso = f.read()
else:
key_virtuoso = os.environ['key_virtuoso']
            # Exact-match fallback: if a quoted term was not identified by the NER, it is added below to the dataframe as a forced entity:
if df_ToAnnotate.empty:
df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
if "SentenceRef" not in df_ToAnnotate.columns:
df_ToAnnotate["SentenceRef"] = None
df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
col != 'SentenceRef']] # this moves it to the first position
df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
df_ToAnnotate[args.source_column]).transform('min').astype(int)
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
# Define the condition to find missing SentenceRefs
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
# Define the condition to check if ContextToAnnotate starts and ends with quotes
quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
args.source_column].str.endswith('"')
# Combine both conditions
condition = missing_sentence_refs & quoted_context
# Select rows from df_ToAnnotate that meet the condition
            rows_to_add = df_ToAnnotate[condition].copy()  # copy to avoid SettingWithCopyWarning on the assignments below
rows_to_add['model'] = "Forced"
rows_to_add['entity_group'] = "MISC"
            rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
rows_to_add['score'] = 1.0
rows_to_add['start'] = int(1)
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
rows_to_add['IsGeo'] = None
rows_to_add['IsBio'] = None
rows_to_add['IsCrossInside'] = 0.0
if df_annotated_combined.empty:
df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)
# Append these rows to df_annotated_combined
df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)
df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)
df_annotated_combined = df_annotated_combined.sort_values(
by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
ascending=[True, True, True, True, False])
# Now df_annotated_combined contains the additional rows
df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
text_splitter, args, key_geonames,
cache_map_geonames,
key_virtuoso,
cache_map_virtuoso,
load_map_query_input_output,
device)
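            # elinking returns the annotations enriched with linked URIs plus the updated
            # geonames / Virtuoso / LLM caches; when caching is enabled they are written back
            # to disk below so later runs can reuse them.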
if strtobool(args.USE_CACHE):
if cache_map_geonames_AFTER is not None:
with open(cache_filename, "w") as f:
json.dump(cache_map_geonames_AFTER, f)
if cache_map_virtuoso_AFTER is not None:
with open(cacheVirtuoso_filename, "w") as f:
json.dump(cache_map_virtuoso_AFTER, f)
if load_map_query_input_output_AFTER is not None:
with open(cache_nameLLMs, "w") as f:
json.dump(load_map_query_input_output_AFTER, f)
### end entity linking part
### filter by selected category only
# #df_annotated_combined = df_annotated_combined[df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in CategoriesSelection])]
# if "MED" in CategoriesSelection:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
# else:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection])
# df_annotated_combined = df_annotated_combined[filter_mask]
#
# if "MED" in CategoriesSelection:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
# elif "OTHER" in CategoriesSelection:
# filter_mask = ~(
# df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
# else:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection])
filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
[cat.lower() for cat in CategoriesSelection])
if "MED" in CategoriesSelection:
filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & (df_annotated_combined['IsBio'] == 1)
if "MISC" in CategoriesSelection:
# filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
# filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC
            filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin(
                [cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1)
            # With this clause MISC also captures entities whose category is outside categories_List
            # (i.e. not MED, PER, ORG, LOC, ...) and that are not biomedical.
df_annotated_combined = df_annotated_combined[filter_mask]
if df_annotated_combined.empty:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
###
#df_annotated_combined = is_cross_inside(df_annotated_combined, args)
if 'IsCrossInside' in df_annotated_combined.columns:
df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
if df_annotated_combined.empty:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
### continue linking part:
if strtobool(args.entity_linking):
# ##### this is to pass the links:
# # Create a new column for the entities with links
df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
# lambda row: (
# f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
# if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
# 'word']
# ),
lambda row: (
f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
'word']
),
axis=1
)
# Create a new dictionary with the entity information and the link
dict_annotated_combined_NEL = df_annotated_combined[
["end", "entity_group", "score", "start", "entity_with_link"]].to_dict(orient="records")
# Sort the entities by their start index
dict_annotated_combined_NEL.sort(key=lambda x: x['start'])
# Create a dictionary to map entity groups to colors
entity_colors = {
"MED": "#E6E6E6",
"PER": "#FFC0CB",
"ORG": "#C6F4D6",
"LOC": "#FFFFCC",
"MISC": "#F5DEB3"
}
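            # The loop below splices each entity's anchor tag into the running text, left to
            # right. Every replacement changes the string length, so `offset` tracks the
            # cumulative shift and keeps the original start/end positions pointing correctly.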
text_with_links = text
offset = 0
for entity in dict_annotated_combined_NEL:
start = entity["start"] + offset
end = entity["end"] + offset
entity_text = entity["entity_with_link"]
text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
offset += len(entity_text) - (end - start)
# # Create the text with entities highlighted and linked
# text_with_links = text
# offset = 0
# for entity in dict_annotated_combined_NEL:
# start = entity["start"] + offset
# end = entity["end"] + offset
# entity_text = entity["entity_with_link"]
# entity_group = entity["entity_group"]
#
# color = entity_colors.get(entity_group, "#dbeafe") # Default
# darker_color = "#008080"
#
# if "https:" in entity_text:
# text_with_links = text_with_links[
# :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[
# end:]
# offset += len(
# f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
# end - start)
# # text_with_links = text_with_links[:start] + f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>' + text_with_links[end:]
# # offset += len(
# # f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>') - (
# # end - start)
# #
# # text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
# # offset += len(entity_text) - (end - start)
# else:
# text_with_links = text_with_links[
# :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[end:]
# offset += len(
# f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
# end - start)
# # text_with_links = text_with_links[
# # :start] + f'<span style="background-color: {color}">{entity_text}</span>' + text_with_links[
# # end:]
# # offset += len(
# # f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
# Update state with the DataFrame
state["df_annotated_combined_dict"] = df_annotated_combined.to_dict()
if 'ALLURIScontext' in df_annotated_combined.columns:
# words_for_dropdown = df_annotated_combined[
# df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
# 'word'].unique().tolist()
                words_for_dropdown = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
                    lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]['word'].unique().tolist()
words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
words_for_dropdown.insert(0, "")
else:
words_for_dropdown = []
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
state["html_output"] = html_output
#return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
else:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
else:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
# "FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1"
def update_urls(selected_word, state):
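    # Gradio callback for the "Annotated Concepts" dropdown: rebuilds the annotated DataFrame
    # from state and returns the HTML list of candidate URIs for the selected word.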
if "df_annotated_combined_dict" in state:
# Convert the state dictionary back into a DataFrame
df = pd.DataFrame(state["df_annotated_combined_dict"])
if 'ALLURIScontext' in df.columns:
# # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
# valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
# # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
            valid_entries = df[df['ALLURIScontext'].apply(
                lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]
# Check if the selected word is in the filtered DataFrame
if selected_word in valid_entries['word'].values:
urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
if 'namedEntity' in df.columns:
firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
if firsturlinlist and firsturlinlist in urls:
# Remove the URL from its current position
urls.remove(firsturlinlist)
# Insert the URL at the first position
urls.insert(0, firsturlinlist)
# Convert list of URLs to HTML string with clickable links
#html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
return html_links
return ""
else:
return""
else:
return ""
# demo = gr.Interface(
# fn=nerBio,
# inputs=[
# gr.Textbox(label= "Input text", placeholder="Enter text here..."),
# gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List),
# gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List),
# gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7),
# gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False), #True False
# #gr.CheckboxGroup(POSSIBLE_KGchoices_List, label="KGchoices Selection", value=POSSIBLE_KGchoices_List, visible=True),
# gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List),
# gr.State(value={})
# ],
# outputs=[
# gr.HighlightedText(label="Annotated Text"),
# gr.HTML(label="Linked Text", show_label=True, visible=True), # use gr.HTML to render the annotated text with links , visible
# gr.State(),
# gr.Dropdown(label="Annotated Concepts", interactive=True,visible=True),
# gr.Textbox(label="Linked Entities",interactive=False,visible=True)
# ],
# live=True,
# title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
# description="""Interoperability β the capability of systems and organisations to cooperate across functional, sectoral and physical borders β is key for successful digital transformation.
# The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
# Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
#
# In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
# """,
# examples=examples,
# cache_examples=False,
# article="""
# **Categories Legend:**
# - MED | Medical
# - LOC | Locations
# - PER | Persons
# - ORG | Organizations
# - MISC | Miscellanea
# - CONC | Concepts & Ideas
# - BIOP | Biological
# - ACTI | Activities & Behaviors
# - ANAT | Anatomy
# - CHEM | Chemicals & Drugs
# - DEVI | Devices
# - DISO | Disorders
# - GENE | Genes & Molecular Sequences
# - GEOG | Geographic Areas
# - LIVB | Living Beings
# - OBJC | Objects
# - OCCU | Occupations
# - ORGA | Organizations
# - PHEN | Phenomena
# - PHYS | Physiology
# - PROC | Procedures
# """
# )
# Define the Gradio interface using Blocks
#description="This application performs biomedical named-entity recognition and linking."
with gr.Blocks(title="BioAnnotator") as demo:
gr.Markdown("# BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)")
gr.Markdown("""
This application performs biomedical named-entity recognition and linking.
**Description:**
*Interoperability* – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
This demo presents *BioAnnotator*, a prototype tool for Biomedical Named-Entity Recognition (NER) and Linking (NEL). To try it, select one or more NER models and enter some text to process. Also choose the entity categories to extract and the score threshold for the NER extraction. Finally, decide whether to perform Named-Entity Linking (NEL) and whether to restrict linking to specific biomedical ontologies (acronym descriptions at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
""")
with gr.Row():
with gr.Column():
text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.05, label="Score", value=0.75)
nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
state = gr.State(value={})
with gr.Column():
annotated_text = gr.HighlightedText(label="Annotated Text")
linked_text = gr.HTML(label="Linked Text", show_label=True, visible=True)
word_dropdown = gr.Dropdown(label="Annotated Concepts", show_label=True, visible=True, interactive=True)
urls_html = gr.HTML(label="Linked Entities", show_label=True, visible=True)
## Define the interactions
#text_input.change(fn=nerBio, inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection, state], outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
# Define the interactions for all inputs
inputs = [text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection]
for input_component in inputs:
input_component.change(fn=nerBio,
inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
kgchoices_selection, state],
outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
word_dropdown.change(fn=update_urls, inputs=[word_dropdown, state], outputs=urls_html)
# Add examples
gr.Examples(examples=examples,
inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
kgchoices_selection])
gr.Markdown("""
**Categories Legend:**
- MED | Medical
- LOC | Locations
- PER | Persons
- ORG | Organizations
- MISC | Miscellanea
- CONC | Concepts & Ideas
- BIOP | Biological
- ACTI | Activities & Behaviors
- ANAT | Anatomy
- CHEM | Chemicals & Drugs
- DEVI | Devices
- DISO | Disorders
- GENE | Genes & Molecular Sequences
- GEOG | Geographic Areas
- LIVB | Living Beings
- OBJC | Objects
- OCCU | Occupations
- ORGA | Organizations
- PHEN | Phenomena
- PHYS | Physiology
- PROC | Procedures
""")
demo.launch()
#demo.launch(share=True) # Share your demo with just 1 extra parameter