import os

#os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only

#os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"

from transformers import file_utils
print(file_utils.default_cache_path)

import pandas as pd
from tqdm import tqdm
from gliner import GLiNER
import logging
from jinja2 import Template
from collections import Counter

from transformers import pipeline, AutoTokenizer

#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

#import html

import torch
torch.cuda.empty_cache()  # clear any cached GPU memory held by torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}...")
if device.type == "cuda":
    print("GPU number:", torch.cuda.current_device())

import datasets

import argparse
import json
import random
import numpy as np

import tiktoken
from langchain.text_splitter import TokenTextSplitter

import gradio as gr
import re
from common import strtobool, token_counter, encoding_getter, strip_quotes
from nerBio import annotate, entitiesFusion, is_cross_inside, elinking
from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc


from joblib import Memory

cachedir = 'cached'
mem = Memory(cachedir, verbose=False)

# this is to completely delete the cache:
# mem.clear(warn=False)
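# A minimal usage sketch (hypothetical, not active code): decorating a function with
# @mem.cache persists its results under ./cached, so repeated calls with the same
# arguments are read back from disk instead of being recomputed:
#
#   @mem.cache
#   def expensive_lookup(term):
#       ...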





examples = [
["He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. " , None],
["He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah.  He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. ", None],
["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. \"So far, the chikungunya disease is still under control nationwide,\" he told reporters after visiting Sultanah Nur Zahirah Hospital here. Present was Terengganu Health Director Dr. Nordiyanah Hassan. Liow said that so far, there is no specific medicine to treat the chikungunya fever disease spread by _Aedes_ mosquito. \"So, I would like to call on the public to be careful particularly during the wet season now because _Aedes_ mosquito is easy to breed,\" he said. To contain the spread of the disease, he said, the ministry had taken several measures including intensifying the campaign to rid of _Aedes_ mosquito and holding lectures on the outbreak. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. Meanwhile, he said 63 health projects costing RM458 million [USD 131 230 211] had been approved for implementation in Terengganu under the Ninth Malaysia Plan and some had started.", None],
["Carcinoma", None],
["The doctor diagnosed the patient with basal cell carcinoma, a common type of skin cancer.", None],
["West Nile virus", None],
["Legionellosis", None],
["Eight years ago I started with Fosamax for 3-4 years and then took Actonel. In March, I decided not to take Actonel any longer. I had been on it for too long and was fearful of esophageal cancer and bone breakage. Now my doctor wants me to take the Prolia injections, which I am not going to do. I am not going to continue with any drugs. My bone density recently done was in the minuses. I do work with a personal trainer and execise daily. I am searching for alternative ways to deal with this problem.", None],
["Does Chicago have any stores and does Joe live here?", None],
["Cholera has been reported every week since November 1994. By 5 November 1995 at total of 12,344 with 245 deaths have been notified. Of these, 879 cases with 4 deaths were reported for the period 9 October to 5 November 1995. Control efforts have not succeeded in preventing the spread of the epidemic and when cases were detected on Sao Nicolau and Sal Islands in the period 9 October to 5 November all nine inhabited islands of Cap Verde had become infected. The last cholera epidemic in Cap Verde occurred in 1979. (See also Weekly Epidemiological Record No. 44, 3 November 1995) CΓ΄te d'Ivoire: A cholera outbreak which started in September 1995 caused 2,027 cases and 150 deaths up to 12 November 1995. The first cases were reported in Department de l'Ouest on 18 September 1995. Cases were subsequently reported in Department de Nord and most recently in Department du Centre and Department de Sud. The WHO Representative assisted in the organization of a team to visit the area and evaluate the situation as well as arranging for medical supplies. (1.12.95) Iran, Islamic Republic of,: Kordestan Province has been declared free of cholera. (1.12.95) Iraq: An outbreak of cholera reported from Sulaimaniyah Governorate in Northern Iraq has resulted in 519 cases, 264 of which have been confirmed, and 3 deaths to date. Vibrio cholerae O1 serotype Ogawa has been isolated. At the request of the Iraqi Ministry of Health, a WHO consultant has been sent to the area to assess and monitor the situation, provide guidance to the health authorities, and coordinate inputs by non-governmental organizations. WHO has also made available essential treatment supplies. An intensive media campaign to raise public awareness about essential preventive measures has been successful in containing the spread of the outbreak. (1.12.95) Senegal: Despite the fact that cholera has been endemic in countries bordering Senegal for the past two years, no cases were reported from Senegal until mid- August 1995. Between 15 August and 17 November 1995, 852 case and 43 deaths were notified. A further 731 cases with 37 deaths have been reported for the period 1 September to 12 November. Most cases were in the Departments of Dakar and Pikine in the Dakar Region and recently also Departments of Mbacke and Touba in Diourbel Region. ", None],
]



models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",  "Babelscape/wikineural-multilingual-ner",  "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1",  "knowledgator/gliner-multitask-large-v0.5"
#models_List = ["Babelscape/wikineural-multilingual-ner",  "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1",  "knowledgator/gliner-multitask-large-v0.5"
#models_List = ["NCBO/BioPortal" ]

#categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
categories_List = ["MED","LOC","PER","ORG","DATE","MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM",  "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS" , "PROC"]

POSSIBLE_KGchoices_List = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
             "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
             "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
             "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
             "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]


modelGliner = None
modelGlinerBio = None

num_cores_Gliner_forDemo = 0  # 0 means use the GPU for Gliner !
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')


encod = encoding_getter('microsoft/deberta-v3-large')
text_splitter = TokenTextSplitter(
    # separators=separators,
    encoding_name=encod.name,
    chunk_size=80000,
    chunk_overlap=50,
    length_function=len,
    add_start_index=True,
)
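# The splitter measures chunks with the tokenizer encoding returned by encoding_getter;
# chunk_size matches the default --tokens_max (80000) used for the LLM calls below, and
# the 50-token overlap reduces the chance of cutting an entity mention in half at a
# chunk boundary.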

pipe_dict = {}
for modelName in models_List:
    tsk = "token-classification"
    if (("/gliner" in modelName) == False) and (("NCBO" in modelName) == False):
        pipe = pipeline(
            tsk,
            model=modelName,
            aggregation_strategy="simple",
            device=device,
        )
        pipe_dict[modelName] = pipe
    elif ("/gliner" in modelName):
        if not tokenizerGliner:
            tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
        if "_bio-" in modelName:
            if num_cores_Gliner_forDemo > 0:
                modelGlinerBio = GLiNER.from_pretrained(modelName)  # e.g. "urchade/gliner_large_bio-v0.1"
            else:
                modelGlinerBio = GLiNER.from_pretrained(modelName, map_location=device)
        else:
            if num_cores_Gliner_forDemo > 0:
                modelGliner = GLiNER.from_pretrained(
                    modelName)  # "knowledgator/gliner-multitask-large-v0.5" - "urchade/gliner_large-v2.1"
            else:
                modelGliner = GLiNER.from_pretrained(modelName, map_location=device)
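
# At this point pipe_dict holds a token-classification pipeline for every standard
# Hugging Face NER model in models_List, while the GLiNER checkpoints live in
# modelGliner / modelGlinerBio (loaded on CPU when num_cores_Gliner_forDemo > 0,
# otherwise on the selected device); no local model is created for NCBO/BioPortal,
# which is handled inside annotate() instead.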


#### GPT@JRC API
#if args.service_provider == "gptjrc":
key_gptjrc = ""
fkeyname = "GPTJRC-APItoken.key"
if os.path.exists(fkeyname):
    with open(fkeyname) as f:
        key_gptjrc = f.read()
else:
    key_gptjrc = os.environ.get('key_gptjrc', "")  # fall back to the environment; empty string if unset
if key_gptjrc and key_gptjrc != "":
    setup_gptjrc(key_gptjrc)
#####


# Handle selection of a word from the "Annotated Concepts" dropdown: return its candidate URLs as HTML links
def get_urls(word, df_annotated_combined):
    # Filter the DataFrame to rows where 'ALLURIScontext' is a non-empty list
    # containing at least one non-blank entry
    valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
        lambda x: isinstance(x, list) and len(x) > 0
        and not (len(x) == 1 and not str(x[0]).strip()))]

    # Check if the word is in the filtered DataFrame
    if word in valid_entries['word'].values:
        urls = valid_entries.loc[valid_entries['word'] == word, 'ALLURIScontext'].values[0]

        if 'namedEntity' in df_annotated_combined.columns:
            firsturlinlist = df_annotated_combined.loc[df_annotated_combined['word'] == word, 'namedEntity']
            firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
            if firsturlinlist and firsturlinlist in urls:
                # Remove the URL from its current position
                urls.remove(firsturlinlist)
                # Insert the URL at the first position
                urls.insert(0, firsturlinlist)

        #html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
        html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
        return html_links
    return ""




###@mem.cache
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):

    if EntityLinking:
        EnableNEL="True"
    else:
        EnableNEL="False"

    if not text:
        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
        state = {
            "text": "",
            "df_annotated_dict": dict(),
            "df_annotated_combined_dict": dict(),
            "KGchoices": KGchoices,
            "ModelsSelection": ModelsSelection,
            "ScoreFilt": ScoreFilt,
            "EntityLinking": EntityLinking,
            "html_output": html_output
        }
        return {"text": text, "entities": []}, html_output, state, [], ""

    df_annotated = pd.DataFrame()

    parser = argparse.ArgumentParser()

    parser.add_argument("--model_id", type=str, default=models_List[0], help="model to use")

    parser.add_argument("--debug", type=str, default="True", help="set debug mode")

    parser.add_argument("--source_column", type=str, default="ContextToAnnotate")

    parser.add_argument("--entities_filter_threshold", type=int, default=ScoreFilt)

    parser.add_argument("--SEED", type=int, default=41)
    parser.add_argument("--batch_size", type=int, default=32)  # 4 - 8 - 16
    parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation")  # 0 means use the GPU for Gliner !

    parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
    parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file location where it is stored the geonames api key")
    parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file location where it is stored the virtuoso endpoint dba pwd")
    parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file location where it is stored the NCBO  BioPortal api key")

    # consose 20250205:
    # KGchoices = None
    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'NCIT']
    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
    if KGchoices:
        KGchoices.sort()
    parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
                        help="List of ontologies to which restrict the entity linking task.")
    #consose 20250502:
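    # Counter(...) == Counter(...) compares the two lists as unordered multisets: the
    # on-disk cache is enabled only when every possible ontology is selected, presumably
    # so that a restricted KG selection is never served results cached under other restrictions.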
    if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
        parser.add_argument("--USE_CACHE", type=str, default="True",
                            help="whether to use cache for the NER and NEL tasks or not")
    else:
        #print("Lists do not have the same elements")
        parser.add_argument("--USE_CACHE", type=str, default="False",
                            help="whether to use cache for the NER and NEL tasks or not")

    parser.add_argument("--num_cores_eLinking", type=int, default=10, help="parallel processing for the entity linking process")

    parser.add_argument("--computeEntityContext", type=str, default="False",
                        help="whether to extract a readable context from the extracted triples for the concept")
    parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
                        help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
    parser.add_argument("--maxTriplesGlobalContext", type=int, default=20000,
                        help="maximum number of triples to consider for global context computation")  # if 0 or None it is not considered
    parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
                        help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")

    parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
    parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="llm to use")
    parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")

    parser.add_argument("--temperature", type=int, default=0.01)


    args = parser.parse_args()

    df_ToAnnotate = pd.DataFrame()

    previous_text = ""
    previous_df_annotated_dict = dict()
    previous_kg_choices = []
    if state:
        previous_text = state.get("text", "")
        previous_df_annotated_dict = state.get("df_annotated_dict", {})
        previous_df_annotated_combined_dict = state.get("df_annotated_combined_dict", {})
        previous_kg_choices = state.get("KGchoices", [])
        previous_ModelsSelection = state.get("ModelsSelection", [])
        previous_ScoreFilt_from_state = float(state.get("ScoreFilt", ScoreFilt))  # Ensure ScoreFilt is a float
        previous_EntityLinking_from_state = bool(state.get("EntityLinking", EntityLinking))  # Ensure EntityLinking is a boolean
        previous_html_output = state.get("html_output", "")


        if (previous_html_output and previous_df_annotated_dict and previous_df_annotated_combined_dict
                and (previous_text == text)
                and (sorted(previous_kg_choices) == sorted(KGchoices))
                and (sorted(previous_ModelsSelection) == sorted(ModelsSelection))
                and (previous_ScoreFilt_from_state == ScoreFilt)
                and (previous_EntityLinking_from_state == EntityLinking)):
            ddf_annot_prev = pd.DataFrame(previous_df_annotated_combined_dict)
            if 'ALLURIScontext' in ddf_annot_prev.columns:
                words_for_dropdown = ddf_annot_prev[ddf_annot_prev['ALLURIScontext'].apply(
                    lambda x: isinstance(x, list) and len(x) > 0
                    and not (len(x) == 1 and not str(x[0]).strip()))]['word'].unique().tolist()
                words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
                words_for_dropdown.insert(0, "")
            else:
                words_for_dropdown = []

            dict_annotated_combined_NER = ddf_annot_prev[
                ["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")

            # return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
            return {"text": text, "entities": dict_annotated_combined_NER}, previous_html_output, state, gr.update(
                choices=words_for_dropdown), ""





    #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
    #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
    #if (not history_dict) or (history_dict[args.source_column][0] != text):
    if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices) ):

        for model_id in models_List:  # always do all the annotations, only filter them afterwards
        #for model_id in ModelsSelection:

            # if history_dict and (history_dict[args.source_column][0] == text):
            #     if model_id in hhist['model'].unique():
            #         continue

            parser.set_defaults(model_id=model_id)

            args = parser.parse_args()

            print("ARGS:")
            print(args)

            # In machine learning tasks that involve stochasticity (such as text generation),
            # set seeds for Python's random module, NumPy, and PyTorch so that results are
            # reproducible across runs. Set the seeds before creating the pipeline:
            random.seed(args.SEED)
            np.random.seed(args.SEED)
            torch.manual_seed(args.SEED)
            torch.cuda.manual_seed_all(args.SEED)
            ###

            df_ToAnnotate = pd.DataFrame({ "ToLink": [None], args.source_column: [text]})

            if "SentenceRef" not in df_ToAnnotate.columns:
                df_ToAnnotate["SentenceRef"] = None
                df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
                                                                 col != 'SentenceRef']]  # this moves it to the first position

            df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int)
            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
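            # The three steps above build a compact sentence id: row position, then the
            # minimum position per identical sentence text, then a dense rank, so that
            # duplicate sentences end up sharing the same 1-based SentenceRef.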

            # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            # if strtobool(args.debug):
            #     print(f"Device: {device}...")
            #     if device.type == "cuda":
            #         print("GPU number:", torch.cuda.current_device())

            pipeToUse = None
            if (("gliner" in args.model_id) == False) and (("NCBO" in args.model_id)== False) :
                pipeToUse = pipe_dict[args.model_id]

            new_annotations = annotate(df_ToAnnotate, args, pipeToUse, tokenizerGliner, modelGliner, modelGlinerBio, device)
            if not new_annotations.empty:
                if df_annotated.empty:
                    # If df_annotated is empty, just assign new_annotations to it
                    df_annotated = new_annotations
                else:
                    # If df_annotated is not empty, concatenate new_annotations to it
                    df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)

        state = {
            "text": text,
            "df_annotated_dict": df_annotated.to_dict(),
            "df_annotated_combined_dict": dict(),
            "KGchoices": KGchoices,
            "ModelsSelection": ModelsSelection,
            "ScoreFilt": ScoreFilt,
            "EntityLinking": EntityLinking,
            "html_output": ""
        }

    else:

        print("ARGS:")
        print(args)

        # In machine learning tasks that involve stochasticity (such as text generation),
        # set seeds for Python's random module, NumPy, and PyTorch so that results are
        # reproducible across runs:
        random.seed(args.SEED)
        np.random.seed(args.SEED)
        torch.manual_seed(args.SEED)
        torch.cuda.manual_seed_all(args.SEED)
        ###

        history = pd.DataFrame(previous_df_annotated_dict)
        df_annotated = history.copy()

        state = {
            "text": text,
            "df_annotated_dict": df_annotated.to_dict(),
            "df_annotated_combined_dict": dict(),
            "KGchoices": KGchoices,
            "ModelsSelection": ModelsSelection,
            "ScoreFilt": ScoreFilt,
            "EntityLinking": EntityLinking,
            "html_output": ""
        }


    quoted_text = text.startswith('"') & text.endswith('"')
    if (not df_annotated.empty) or quoted_text:

        if (not df_annotated.empty):
            # filter now per models selection
            df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
            if df_annotated.empty and not quoted_text:
                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
                state["html_output"] = html_output
                return {"text": text, "entities": []}, html_output, state, [], ""

        df_annotated_combined = pd.DataFrame()
        if (not df_annotated.empty):
            df_annotated_combined = entitiesFusion(df_annotated,args)
            if df_annotated_combined.empty and not quoted_text:
                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
                state["html_output"] = html_output
                return {"text": text, "entities": []}, html_output, state, [], ""
            else:
                if (not df_annotated.empty):
                    df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999)  # cut all crossing/nested spans at the 0.999 threshold, to keep them out of the linking step


        cache_prefix_fp = "LLMQUERYNER"
        cache_nameLLMs = cache_prefix_fp + "___" + "__".join(
            [args.service_provider, args.model_name, str(args.temperature)]).replace(
            " ", "_") + ".json"

        load_map_query_input_output = None
        if strtobool(args.USE_CACHE):
            if os.path.exists(cache_nameLLMs):
                with open(cache_nameLLMs) as f:
                    load_map_query_input_output = json.load(f)
            else:
                load_map_query_input_output = {}

        ### entity linking part:
        if strtobool(args.entity_linking):

            cache_map_geonames = None
            if strtobool(args.USE_CACHE):
                cache_filename = "CACHE_geonames.json"
                if os.path.exists(cache_filename):
                    with open(cache_filename) as f:
                        cache_map_geonames = json.load(f)
                else:
                    cache_map_geonames = {}

            key_geonames = ""
            if args.geonameskey_filename and os.path.exists(args.geonameskey_filename):
                fkeyname = args.geonameskey_filename
                with open(fkeyname) as f:
                    key_geonames = f.read()
            else:
                key_geonames = os.environ['key_geonames']

            cache_map_virtuoso = None
            if strtobool(args.USE_CACHE):
                cacheVirtuoso_filename = "CACHE_virtuoso.json"
                if os.path.exists(cacheVirtuoso_filename):
                    with open(cacheVirtuoso_filename) as f:
                        cache_map_virtuoso = json.load(f)
                else:
                    cache_map_virtuoso = {}

            key_virtuoso = ""
            if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename):
                fkeyname = args.virtuosokey_filename
                with open(fkeyname) as f:
                    key_virtuoso = f.read()
            else:
                key_virtuoso = os.environ['key_virtuoso']


            # EXACT-MATCH fallback: if the quoted term was not identified by the NER, add it to the dataframe:

            if df_ToAnnotate.empty:
                df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})

                if "SentenceRef" not in df_ToAnnotate.columns:
                    df_ToAnnotate["SentenceRef"] = None
                    df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
                                                                     col != 'SentenceRef']]  # this moves it to the first position

                df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
                df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
                    df_ToAnnotate[args.source_column]).transform('min').astype(int)
                df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)

            # Guard: if nothing was annotated, create an empty frame with the right
            # columns so that the SentenceRef lookup below does not fail
            if df_annotated_combined.empty:
                df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)

            # Define the condition to find missing SentenceRefs
            missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])

            # Define the condition to check if ContextToAnnotate starts and ends with quotes
            quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
                args.source_column].str.endswith('"')

            # Combine both conditions
            condition = missing_sentence_refs & quoted_context

            # Select rows from df_ToAnnotate that meet the condition
            rows_to_add = df_ToAnnotate[condition].copy()  # copy to avoid SettingWithCopyWarning

            rows_to_add['model'] = "Forced"
            rows_to_add['entity_group'] = "MISC"
            rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
            rows_to_add['score'] = 1.0
            rows_to_add['start'] = 1
            rows_to_add['end'] = rows_to_add['word'].apply(len) + 1
            rows_to_add['IsGeo'] = None
            rows_to_add['IsBio'] = None
            rows_to_add['IsCrossInside'] = 0.0
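            # The whole quoted string is injected as a single "Forced" MISC entity with a
            # perfect score, so the exact-match entity linking below still runs even though
            # no NER model produced a span for it.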

            # Append these rows to df_annotated_combined
            df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)

            df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
            df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)

            df_annotated_combined = df_annotated_combined.sort_values(
                by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
                ascending=[True, True, True, True, False])

            # Now df_annotated_combined contains the additional rows

            df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(
                df_annotated_combined, text_splitter, args, key_geonames, cache_map_geonames,
                key_virtuoso, cache_map_virtuoso, load_map_query_input_output, device)

            if strtobool(args.USE_CACHE):
                if cache_map_geonames_AFTER is not None:
                    with open(cache_filename, "w") as f:
                        json.dump(cache_map_geonames_AFTER, f)

                if cache_map_virtuoso_AFTER is not None:
                    with open(cacheVirtuoso_filename, "w") as f:
                        json.dump(cache_map_virtuoso_AFTER, f)

                if load_map_query_input_output_AFTER is not None:
                    with open(cache_nameLLMs, "w") as f:
                        json.dump(load_map_query_input_output_AFTER, f)

            ### end entity linking part


        ### filter by selected category only
        # #df_annotated_combined = df_annotated_combined[df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in CategoriesSelection])]
        # if "MED" in CategoriesSelection:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
        # else:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection])
        # df_annotated_combined = df_annotated_combined[filter_mask]
        #
        # if "MED" in CategoriesSelection:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
        # elif "OTHER" in CategoriesSelection:
        #     filter_mask = ~(
        #         df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
        # else:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection])

        filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
            [cat.lower() for cat in CategoriesSelection])
        if "MED" in CategoriesSelection:
            # also keep biomedical entities whose group falls outside categories_List
            filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin(
                [cat.lower() for cat in categories_List])) & (df_annotated_combined['IsBio'] == 1)
        if "MISC" in CategoriesSelection:
            # with this clause, include not only the entities labelled MISC, but also any
            # non-biomedical entity whose group falls outside categories_List
            filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin(
                [cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1)

        df_annotated_combined = df_annotated_combined[filter_mask]
        if df_annotated_combined.empty:
            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
            state["html_output"] = html_output
            return {"text": text, "entities": []}, html_output, state, [], ""

        ###

        #df_annotated_combined = is_cross_inside(df_annotated_combined, args)

        if 'IsCrossInside' in df_annotated_combined.columns:
            df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
            if df_annotated_combined.empty:
                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
                state["html_output"] = html_output
                return {"text": text, "entities": []}, html_output, state, [], ""

        dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")

        ### continue linking part:
        if strtobool(args.entity_linking):
            ##### pass the entity links through to the rendered HTML:

            # Create a new column with each entity's surface form wrapped in its link (if any)
            df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
                # lambda row: (
                #     f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
                #     if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
                #         'word']
                # ),
                lambda row: (
                   f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
                   if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
                       'word']
                ),
                axis=1
            )

            # Create a new dictionary with the entity information and the link
            dict_annotated_combined_NEL = df_annotated_combined[
                ["end", "entity_group", "score", "start", "entity_with_link"]].to_dict(orient="records")

            # Sort the entities by their start index
            dict_annotated_combined_NEL.sort(key=lambda x: x['start'])

            # Create a dictionary to map entity groups to colors
            entity_colors = {
                "MED": "#E6E6E6",
                "PER": "#FFC0CB",
                "ORG": "#C6F4D6",
                "LOC": "#FFFFCC",
                "MISC": "#F5DEB3"
            }
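
            # Note: entity_colors is currently only used by the commented-out styled
            # renderer below; the active code inserts plain <a> links. The rewrite works
            # left-to-right over entities sorted by 'start': each substitution changes the
            # string length, so 'offset' accumulates the delta and keeps the remaining
            # original start/end indices valid.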

            text_with_links = text
            offset = 0
            for entity in dict_annotated_combined_NEL:
                start = entity["start"] + offset
                end = entity["end"] + offset
                entity_text = entity["entity_with_link"]
                text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
                offset += len(entity_text) - (end - start)

            # # Create the text with entities highlighted and linked
            # text_with_links = text
            # offset = 0
            # for entity in dict_annotated_combined_NEL:
            #     start = entity["start"] + offset
            #     end = entity["end"] + offset
            #     entity_text = entity["entity_with_link"]
            #     entity_group = entity["entity_group"]
            #
            #     color = entity_colors.get(entity_group, "#dbeafe")  # Default
            #     darker_color = "#008080"
            #
            #     if "https:" in entity_text:
            #         text_with_links = text_with_links[
            #                           :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[
            #                                                                                                                                                                                                                                                                                                                                 end:]
            #         offset += len(
            #             f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
            #                               end - start)
            #         # text_with_links = text_with_links[:start] + f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>' + text_with_links[end:]
            #         # offset += len(
            #         #     f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>') - (
            #         #                       end - start)
            #         #
            #         #     text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
            #         #     offset += len(entity_text) - (end - start)
            #     else:
            #         text_with_links = text_with_links[
            #                           :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[end:]
            #         offset += len(
            #             f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
            #                           end - start)
            #         # text_with_links = text_with_links[
            #         #                   :start] + f'<span style="background-color: {color}">{entity_text}</span>' + text_with_links[
            #         #                                                                                               end:]
            #         # offset += len(
            #         #     f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)


            # Update state with the DataFrame
            state["df_annotated_combined_dict"] = df_annotated_combined.to_dict()

            if 'ALLURIScontext' in df_annotated_combined.columns:
                words_for_dropdown = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
                    lambda x: isinstance(x, list) and len(x) > 0
                    and not (len(x) == 1 and not str(x[0]).strip()))]['word'].unique().tolist()
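                # case-insensitive de-duplication: the dict comprehension keeps one entry
                # (the last-seen casing) per lowercased word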
                words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
                words_for_dropdown.insert(0, "")
            else:
                words_for_dropdown = []

            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
            state["html_output"] = html_output

            #return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""

        else:
            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
            state["html_output"] = html_output
            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""

    else:

        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
        state["html_output"] = html_output
        return {"text": text, "entities": []}, html_output, state, [], ""


# "FacebookAI/xlm-roberta-large-finetuned-conll03-english",  "Babelscape/wikineural-multilingual-ner",  "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1"


def update_urls(selected_word, state):
    if "df_annotated_combined_dict" in state:
    # Convert the state dictionary back into a DataFrame
        df = pd.DataFrame(state["df_annotated_combined_dict"])

        if 'ALLURIScontext' in df.columns:
            # # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
            # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
            # # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
            valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != []  and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip())) ))]

            # Check if the selected word is in the filtered DataFrame
            if selected_word in valid_entries['word'].values:
                urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
                if 'namedEntity' in df.columns:
                    firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
                    firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
                    if firsturlinlist and firsturlinlist in urls:
                        # Remove the URL from its current position
                        urls.remove(firsturlinlist)
                        # Insert the URL at the first position
                        urls.insert(0, firsturlinlist)

                # Convert list of URLs to HTML string with clickable links
                #html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
                html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
                return html_links
            return ""
        else:
            return ""

    else:
        return ""




# Define the Gradio interface using Blocks
#description="This application performs biomedical named-entity recognition and linking."
with gr.Blocks(title="BioAnnotator") as demo:

    gr.Markdown("# BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)")
    gr.Markdown("""
        This application performs biomedical named-entity recognition and linking.

        **Description:**
        *Interoperability* – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation. 
        The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come. 
        Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.

        In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
        """)


    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
            models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
            categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
            score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.05, label="Score", value=0.75)
            nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
            kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
            state = gr.State(value={})

        with gr.Column():
            annotated_text = gr.HighlightedText(label="Annotated Text")
            linked_text = gr.HTML(label="Linked Text", show_label=True, visible=True)
            word_dropdown = gr.Dropdown(label="Annotated Concepts", show_label=True, visible=True, interactive=True)
            urls_html = gr.HTML(label="Linked Entities", show_label=True, visible=True)

    # Define the interactions: re-run nerBio whenever any of the inputs changes
    inputs = [text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection]
    for input_component in inputs:
        input_component.change(fn=nerBio,
                               inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
                                       kgchoices_selection, state],
                               outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])

    word_dropdown.change(fn=update_urls, inputs=[word_dropdown, state], outputs=urls_html)

    # Add examples
    gr.Examples(examples=examples,
                inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
                        kgchoices_selection])

    gr.Markdown("""
        **Categories Legend:**
        - MED  | Medical
        - LOC  | Locations
        - PER  | Persons
        - ORG  | Organizations
        - MISC | Miscellanea
        - CONC | Concepts & Ideas 
        - BIOP | Biological 
        - ACTI | Activities & Behaviors
        - ANAT | Anatomy
        - CHEM | Chemicals & Drugs
        - DEVI | Devices
        - DISO | Disorders
        - GENE | Genes & Molecular Sequences
        - GEOG | Geographic Areas
        - LIVB | Living Beings
        - OBJC | Objects
        - OCCU | Occupations
        - ORGA | Organizations
        - PHEN | Phenomena
        - PHYS | Physiology
        - PROC | Procedures
        """)



demo.launch()
#demo.launch(share=True)  # Share your demo with just 1 extra parameter