import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
#os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
#os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
from transformers import file_utils
print(file_utils.default_cache_path)
import pandas as pd
from tqdm import tqdm
from gliner import GLiNER
import logging
from jinja2 import Template
from collections import Counter
from transformers import pipeline, AutoTokenizer
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
#import html
import torch
torch.cuda.empty_cache()  # clear torch's CUDA cache
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}...")
if device.type == "cuda":
print("GPU number:", torch.cuda.current_device())
import datasets
import argparse
import json
import random
import numpy as np
import tiktoken
from langchain.text_splitter import TokenTextSplitter
import gradio as gr
import re
from common import strtobool, token_counter, encoding_getter, strip_quotes
from nerBio import annotate, entitiesFusion, is_cross_inside, elinking
from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc
from joblib import Memory
cachedir = 'cached'
mem = Memory(cachedir, verbose=False)
# this is to completely delete the cache:
# mem.clear(warn=False)
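# joblib's Memory provides on-disk memoisation of expensive function calls under `cachedir`.
# Illustrative sketch (not active here; note the commented-out `###@mem.cache` decorator
# further below, just above nerBio):
#
#     @mem.cache
#     def slow_step(text):          # hypothetical function name, for illustration only
#         ...                       # any deterministic, expensive computation
#
# Repeated calls with identical arguments would then be served from the cache directory.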
examples = [
["He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. " , None],
["He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. ", None],
["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. \"So far, the chikungunya disease is still under control nationwide,\" he told reporters after visiting Sultanah Nur Zahirah Hospital here. Present was Terengganu Health Director Dr. Nordiyanah Hassan. Liow said that so far, there is no specific medicine to treat the chikungunya fever disease spread by _Aedes_ mosquito. \"So, I would like to call on the public to be careful particularly during the wet season now because _Aedes_ mosquito is easy to breed,\" he said. To contain the spread of the disease, he said, the ministry had taken several measures including intensifying the campaign to rid of _Aedes_ mosquito and holding lectures on the outbreak. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. Meanwhile, he said 63 health projects costing RM458 million [USD 131 230 211] had been approved for implementation in Terengganu under the Ninth Malaysia Plan and some had started.", None],
["Carcinoma", None],
["The doctor diagnosed the patient with basal cell carcinoma, a common type of skin cancer.", None],
["West Nile virus", None],
["Legionellosis", None],
["Eight years ago I started with Fosamax for 3-4 years and then took Actonel. In March, I decided not to take Actonel any longer. I had been on it for too long and was fearful of esophageal cancer and bone breakage. Now my doctor wants me to take the Prolia injections, which I am not going to do. I am not going to continue with any drugs. My bone density recently done was in the minuses. I do work with a personal trainer and execise daily. I am searching for alternative ways to deal with this problem.", None],
["Does Chicago have any stores and does Joe live here?", None],
["Cholera has been reported every week since November 1994. By 5 November 1995 at total of 12,344 with 245 deaths have been notified. Of these, 879 cases with 4 deaths were reported for the period 9 October to 5 November 1995. Control efforts have not succeeded in preventing the spread of the epidemic and when cases were detected on Sao Nicolau and Sal Islands in the period 9 October to 5 November all nine inhabited islands of Cap Verde had become infected. The last cholera epidemic in Cap Verde occurred in 1979. (See also Weekly Epidemiological Record No. 44, 3 November 1995) CΓ΄te d'Ivoire: A cholera outbreak which started in September 1995 caused 2,027 cases and 150 deaths up to 12 November 1995. The first cases were reported in Department de l'Ouest on 18 September 1995. Cases were subsequently reported in Department de Nord and most recently in Department du Centre and Department de Sud. The WHO Representative assisted in the organization of a team to visit the area and evaluate the situation as well as arranging for medical supplies. (1.12.95) Iran, Islamic Republic of,: Kordestan Province has been declared free of cholera. (1.12.95) Iraq: An outbreak of cholera reported from Sulaimaniyah Governorate in Northern Iraq has resulted in 519 cases, 264 of which have been confirmed, and 3 deaths to date. Vibrio cholerae O1 serotype Ogawa has been isolated. At the request of the Iraqi Ministry of Health, a WHO consultant has been sent to the area to assess and monitor the situation, provide guidance to the health authorities, and coordinate inputs by non-governmental organizations. WHO has also made available essential treatment supplies. An intensive media campaign to raise public awareness about essential preventive measures has been successful in containing the spread of the outbreak. (1.12.95) Senegal: Despite the fact that cholera has been endemic in countries bordering Senegal for the past two years, no cases were reported from Senegal until mid- August 1995. Between 15 August and 17 November 1995, 852 case and 43 deaths were notified. A further 731 cases with 37 deaths have been reported for the period 1 September to 12 November. Most cases were in the Departments of Dakar and Pikine in the Dakar Region and recently also Departments of Mbacke and Touba in Diourbel Region. ", None],
]
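# Each example above is a [sample text, None] pair: the text is loaded into the input textbox
# when the example is clicked in the demo; the second element is just a placeholder.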
models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
#models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
#models_List = ["NCBO/BioPortal" ]
#categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
categories_List = ["MED","LOC","PER","ORG","DATE","MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS" , "PROC"]
POSSIBLE_KGchoices_List = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
"GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
"MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
"OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
"SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]
modelGliner=None
modelGlinerBio=None
num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner !
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
encod = encoding_getter('microsoft/deberta-v3-large')
text_splitter = TokenTextSplitter(
# separators=separators,
encoding_name=encod.name,
chunk_size=80000,
chunk_overlap=50,
length_function=len,
add_start_index=True,
)
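# The TokenTextSplitter above chunks long inputs by token count (80000-token chunks with a
# 50-token overlap) so that documents larger than a model's context window can be processed
# piecewise. Illustrative usage (not executed here):
#
#     chunks = text_splitter.split_text(long_document)   # `long_document` is hypothetical
#     for chunk in chunks:
#         ...  # annotate / link each chunk separately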
pipe_dict = {}
for modelName in models_List:
tsk = "token-classification"
if (("/gliner" in modelName) == False) and (("NCBO" in modelName) == False):
pipe = pipeline(
tsk,
model=modelName,
aggregation_strategy="simple",
device=device,
)
pipe_dict[modelName] = pipe
elif ("/gliner" in modelName):
if not tokenizerGliner:
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
if "_bio-" in modelName:
if num_cores_Gliner_forDemo > 0:
modelGlinerBio = GLiNER.from_pretrained(modelName) # "urchade/gliner_large_bio-v0.1")
else:
modelGlinerBio = GLiNER.from_pretrained(modelName, map_location=device)
else:
if num_cores_Gliner_forDemo > 0:
modelGliner = GLiNER.from_pretrained(
modelName) # "knowledgator/gliner-multitask-large-v0.5" - "urchade/gliner_large-v2.1"
else:
modelGliner = GLiNER.from_pretrained(modelName, map_location=device)
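# After this loop: pipe_dict maps every standard Hugging Face model name to a ready
# token-classification pipeline, GLiNER checkpoints are kept separately in modelGliner /
# modelGlinerBio, and NCBO/BioPortal has no local pipeline (it is presumably queried via its
# web API downstream, using the BioPortal key handled in nerBio's arguments).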
#### GPT@JRC API
#if args.service_provider == "gptjrc":
key_gptjrc = ""
fkeyname = "GPTJRC-APItoken.key"
if os.path.exists(fkeyname):
with open(fkeyname) as f:
key_gptjrc = f.read()
else:
key_gptjrc = os.environ['key_gptjrc']
if key_gptjrc and key_gptjrc != "":
setup_gptjrc(key_gptjrc)
#####
# Add this function to handle dropdown selection
def get_urls(word, df_annotated_combined):
# Filter the DataFrame to get rows where 'ALLURIScontext' is not empty or None
#valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
    valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
        lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]
# Check if the word is in the filtered DataFrame
if word in valid_entries['word'].values:
urls = valid_entries.loc[valid_entries['word'] == word, 'ALLURIScontext'].values[0]
if 'namedEntity' in df_annotated_combined.columns:
firsturlinlist = df_annotated_combined.loc[df_annotated_combined['word'] == word, 'namedEntity']
firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
if firsturlinlist and firsturlinlist in urls:
# Remove the URL from its current position
urls.remove(firsturlinlist)
# Insert the URL at the first position
urls.insert(0, firsturlinlist)
#html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
return html_links
return ""
###@mem.cache
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):
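    # Main Gradio callback: runs NER with the selected models on `text`, optionally performs
    # entity linking against the chosen knowledge graphs, and returns five outputs:
    # (dict for HighlightedText, HTML with links, updated state, dropdown update, details string).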
if EntityLinking:
EnableNEL="True"
else:
EnableNEL="False"
if not text:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state = {
"text": "",
"df_annotated_dict": dict(),
"df_annotated_combined_dict": dict(),
"KGchoices": KGchoices,
"ModelsSelection": ModelsSelection,
"ScoreFilt": ScoreFilt,
"EntityLinking": EntityLinking,
"html_output": html_output
}
return {"text": text, "entities": []}, html_output, state, [], ""
df_annotated = pd.DataFrame()
parser = argparse.ArgumentParser()
parser.add_argument("--model_id", type=str, default=models_List[0], help="model to use")
parser.add_argument("--debug", type=str, default="True", help="set debug mode")
parser.add_argument("--source_column", type=str, default="ContextToAnnotate")
parser.add_argument("--entities_filter_threshold", type=int, default=ScoreFilt)
parser.add_argument("--SEED", type=int, default=41)
parser.add_argument("--batch_size", type=int, default=32) # 4 - 8 - 16
parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation") # 0 means use the GPU for Gliner !
parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file location where it is stored the geonames api key")
parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file location where it is stored the virtuoso endpoint dba pwd")
parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file location where it is stored the NCBO BioPortal api key")
# consose 20250205:
# KGchoices = None
# KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'NCIT']
# KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT'] # restricts the input to these values only
if KGchoices:
KGchoices.sort()
parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
help="List of ontologies to which restrict the entity linking task.")
#consose 20250502:
if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
parser.add_argument("--USE_CACHE", type=str, default="True",
help="whether to use cache for the NER and NEL tasks or not")
else:
#print("Lists do not have the same elements")
parser.add_argument("--USE_CACHE", type=str, default="False",
help="whether to use cache for the NER and NEL tasks or not")
parser.add_argument("--num_cores_eLinking", type=int, default=10, help="parallel processing for the entity linking process")
parser.add_argument("--computeEntityContext", type=str, default="False",
help="whether to extract a readable context from the extracted triples for the concept")
parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
parser.add_argument("--maxTriplesGlobalContext", type=int, default=20000,
help="maximum number of triples to consider for global context computation") # if 0 or None it is not considered
parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="llm to use")
parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")
parser.add_argument("--temperature", type=int, default=0.01)
args = parser.parse_args()
df_ToAnnotate = pd.DataFrame()
previous_text = ""
previous_df_annotated_dict = dict()
previous_kg_choices = []
if state:
previous_text = state.get("text", "")
previous_df_annotated_dict = state.get("df_annotated_dict", {})
previous_df_annotated_combined_dict = state.get("df_annotated_combined_dict", {})
previous_kg_choices = state.get("KGchoices", [])
previous_ModelsSelection = state.get("ModelsSelection", [])
previous_ScoreFilt_from_state = float(state.get("ScoreFilt", ScoreFilt)) # Ensure ScoreFilt is a float
previous_EntityLinking_from_state = bool(state.get("EntityLinking", EntityLinking)) # Ensure EntityLinking is a boolean
previous_html_output = state.get("html_output", "")
if previous_html_output and (previous_df_annotated_dict) and (previous_df_annotated_combined_dict) and (previous_text == text) and (sorted(previous_kg_choices) == sorted(KGchoices)) and (sorted(previous_ModelsSelection) == sorted(ModelsSelection)) and (previous_ScoreFilt_from_state == ScoreFilt) and (previous_EntityLinking_from_state == EntityLinking):
ddf_annot_prev = pd.DataFrame(previous_df_annotated_combined_dict)
if 'ALLURIScontext' in ddf_annot_prev.columns:
# words_for_dropdown = df_annotated_combined[
# df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
# 'word'].unique().tolist()
                words_for_dropdown = ddf_annot_prev[ddf_annot_prev['ALLURIScontext'].apply(
                    lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))][
                    'word'].unique().tolist()
words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
words_for_dropdown.insert(0, "")
else:
words_for_dropdown = []
dict_annotated_combined_NER = ddf_annot_prev[
["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
# return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
return {"text": text, "entities": dict_annotated_combined_NER}, previous_html_output, state, gr.update(
choices=words_for_dropdown), ""
#print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
#if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
#if (not history_dict) or (history_dict[args.source_column][0] != text):
if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices) ):
for model_id in models_List: # always do all the annotations, only filter them afterwards
#for model_id in ModelsSelection:
# if history_dict and (history_dict[args.source_column][0] == text):
# if model_id in hhist['model'].unique():
# continue
parser.set_defaults(model_id=model_id)
args = parser.parse_args()
print("ARGS:")
print(args)
            # %% In machine learning tasks that involve stochasticity (e.g. text generation), set seeds for Python's random module, NumPy and PyTorch so that results are reproducible across runs.
# Before you create the pipeline and run the text generation, set the seeds like this:
random.seed(args.SEED)
np.random.seed(args.SEED)
torch.manual_seed(args.SEED)
torch.cuda.manual_seed_all(args.SEED)
###
df_ToAnnotate = pd.DataFrame({ "ToLink": [None], args.source_column: [text]})
if "SentenceRef" not in df_ToAnnotate.columns:
df_ToAnnotate["SentenceRef"] = None
df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
col != 'SentenceRef']] # this moves it to the first position
df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int)
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
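            # The three assignments above give every distinct input sentence a dense 1-based
            # SentenceRef: identical texts share the minimum positional index of their group,
            # and rank(method='dense') compresses those indices into consecutive integers.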
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# if strtobool(args.debug):
# print(f"Device: {device}...")
# if device.type == "cuda":
# print("GPU number:", torch.cuda.current_device())
pipeToUse = None
if (("gliner" in args.model_id) == False) and (("NCBO" in args.model_id)== False) :
pipeToUse = pipe_dict[args.model_id]
new_annotations = annotate(df_ToAnnotate, args, pipeToUse, tokenizerGliner, modelGliner, modelGlinerBio, device)
if not new_annotations.empty:
if df_annotated.empty:
# If df_annotated is empty, just assign new_annotations to it
df_annotated = new_annotations
else:
# If df_annotated is not empty, concatenate new_annotations to it
df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
state = {
"text": text,
"df_annotated_dict": df_annotated.to_dict(),
"df_annotated_combined_dict": dict(),
"KGchoices": KGchoices,
"ModelsSelection": ModelsSelection,
"ScoreFilt": ScoreFilt,
"EntityLinking": EntityLinking,
"html_output": ""
}
else:
print("ARGS:")
print(args)
        # %% In machine learning tasks that involve stochasticity (e.g. text generation), set seeds for Python's random module, NumPy and PyTorch so that results are reproducible across runs.
# Before you create the pipeline and run the text generation, set the seeds like this:
random.seed(args.SEED)
np.random.seed(args.SEED)
torch.manual_seed(args.SEED)
torch.cuda.manual_seed_all(args.SEED)
###
history = pd.DataFrame(previous_df_annotated_dict)
df_annotated = history.copy()
state = {
"text": text,
"df_annotated_dict": df_annotated.to_dict(),
"df_annotated_combined_dict": dict(),
"KGchoices": KGchoices,
"ModelsSelection": ModelsSelection,
"ScoreFilt": ScoreFilt,
"EntityLinking": EntityLinking,
"html_output": ""
}
    quoted_text = text.startswith('"') and text.endswith('"')
if (not df_annotated.empty) or quoted_text:
if (not df_annotated.empty):
# filter now per models selection
df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
if df_annotated.empty and quoted_text==False:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
df_annotated_combined = pd.DataFrame()
if (not df_annotated.empty):
df_annotated_combined = entitiesFusion(df_annotated,args)
if df_annotated_combined.empty and quoted_text==False:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
else:
if (not df_annotated.empty):
                df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999)  # drop crossing/nested spans (threshold 0.999) so they are not passed on to the linking step
cache_prefix_fp = "LLMQUERYNER"
cache_nameLLMs = cache_prefix_fp + "___" + "__".join(
[args.service_provider, args.model_name, str(args.temperature)]).replace(
" ", "_") + ".json"
load_map_query_input_output = None
if strtobool(args.USE_CACHE):
if os.path.exists(cache_nameLLMs):
with open(cache_nameLLMs) as f:
load_map_query_input_output = json.load(f)
else:
load_map_query_input_output = {}
### entity linking part:
if strtobool(args.entity_linking):
cache_map_geonames = None
if strtobool(args.USE_CACHE):
cache_filename = "CACHE_geonames.json"
if os.path.exists(cache_filename):
with open(cache_filename) as f:
cache_map_geonames = json.load(f)
else:
cache_map_geonames = {}
key_geonames = ""
if args.geonameskey_filename and os.path.exists(args.geonameskey_filename):
fkeyname = args.geonameskey_filename
with open(fkeyname) as f:
key_geonames = f.read()
else:
key_geonames = os.environ['key_geonames']
cache_map_virtuoso = None
if strtobool(args.USE_CACHE):
cacheVirtuoso_filename = "CACHE_virtuoso.json"
if os.path.exists(cacheVirtuoso_filename):
with open(cacheVirtuoso_filename) as f:
cache_map_virtuoso = json.load(f)
else:
cache_map_virtuoso = {}
key_virtuoso = ""
if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename):
fkeyname = args.virtuosokey_filename
with open(fkeyname) as f:
key_virtuoso = f.read()
else:
key_virtuoso = os.environ['key_virtuoso']
            # Exact-match fallback: if a quoted term was not identified by the NER, it is added below to the dataframe as a forced entity:
if df_ToAnnotate.empty:
df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
if "SentenceRef" not in df_ToAnnotate.columns:
df_ToAnnotate["SentenceRef"] = None
df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
col != 'SentenceRef']] # this moves it to the first position
df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
df_ToAnnotate[args.source_column]).transform('min').astype(int)
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
# Define the condition to find missing SentenceRefs
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
# Define the condition to check if ContextToAnnotate starts and ends with quotes
quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
args.source_column].str.endswith('"')
# Combine both conditions
condition = missing_sentence_refs & quoted_context
# Select rows from df_ToAnnotate that meet the condition
            rows_to_add = df_ToAnnotate[condition].copy()  # copy to avoid SettingWithCopyWarning on the assignments below
rows_to_add['model'] = "Forced"
rows_to_add['entity_group'] = "MISC"
            rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
rows_to_add['score'] = 1.0
rows_to_add['start'] = int(1)
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
rows_to_add['IsGeo'] = None
rows_to_add['IsBio'] = None
rows_to_add['IsCrossInside'] = 0.0
if df_annotated_combined.empty:
df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)
# Append these rows to df_annotated_combined
df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)
df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)
df_annotated_combined = df_annotated_combined.sort_values(
by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
ascending=[True, True, True, True, False])
# Now df_annotated_combined contains the additional rows
df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
text_splitter, args, key_geonames,
cache_map_geonames,
key_virtuoso,
cache_map_virtuoso,
load_map_query_input_output,
device)
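            # elinking returns the annotations enriched with linked URIs plus the updated
            # geonames / Virtuoso / LLM caches; when caching is enabled they are written back
            # to disk below so later runs can reuse them.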
if strtobool(args.USE_CACHE):
if cache_map_geonames_AFTER is not None:
with open(cache_filename, "w") as f:
json.dump(cache_map_geonames_AFTER, f)
if cache_map_virtuoso_AFTER is not None:
with open(cacheVirtuoso_filename, "w") as f:
json.dump(cache_map_virtuoso_AFTER, f)
if load_map_query_input_output_AFTER is not None:
with open(cache_nameLLMs, "w") as f:
json.dump(load_map_query_input_output_AFTER, f)
### end entity linking part
### filter by selected category only
# #df_annotated_combined = df_annotated_combined[df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in CategoriesSelection])]
# if "MED" in CategoriesSelection:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
# else:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection])
# df_annotated_combined = df_annotated_combined[filter_mask]
#
# if "MED" in CategoriesSelection:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
# elif "OTHER" in CategoriesSelection:
# filter_mask = ~(
# df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
# else:
# filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
# [cat.lower() for cat in CategoriesSelection])
filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
[cat.lower() for cat in CategoriesSelection])
if "MED" in CategoriesSelection:
filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & (df_annotated_combined['IsBio'] == 1)
if "MISC" in CategoriesSelection:
# filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
# filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC
            filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin(
                [cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1)
            # With this clause MISC also captures entities whose category is outside categories_List
            # (i.e. not MED, PER, ORG, LOC, ...) and that are not biomedical.
df_annotated_combined = df_annotated_combined[filter_mask]
if df_annotated_combined.empty:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
###
#df_annotated_combined = is_cross_inside(df_annotated_combined, args)
if 'IsCrossInside' in df_annotated_combined.columns:
df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
if df_annotated_combined.empty:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
### continue linking part:
if strtobool(args.entity_linking):
# ##### this is to pass the links:
# # Create a new column for the entities with links
df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
# lambda row: (
# f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
# if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
# 'word']
# ),
lambda row: (
f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
'word']
),
axis=1
)
# Create a new dictionary with the entity information and the link
dict_annotated_combined_NEL = df_annotated_combined[
["end", "entity_group", "score", "start", "entity_with_link"]].to_dict(orient="records")
# Sort the entities by their start index
dict_annotated_combined_NEL.sort(key=lambda x: x['start'])
# Create a dictionary to map entity groups to colors
entity_colors = {
"MED": "#E6E6E6",
"PER": "#FFC0CB",
"ORG": "#C6F4D6",
"LOC": "#FFFFCC",
"MISC": "#F5DEB3"
}
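            # The loop below splices each entity's anchor tag into the running text, left to
            # right. Every replacement changes the string length, so `offset` tracks the
            # cumulative shift and keeps the original start/end positions pointing correctly.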
text_with_links = text
offset = 0
for entity in dict_annotated_combined_NEL:
start = entity["start"] + offset
end = entity["end"] + offset
entity_text = entity["entity_with_link"]
text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
offset += len(entity_text) - (end - start)
# # Create the text with entities highlighted and linked
# text_with_links = text
# offset = 0
# for entity in dict_annotated_combined_NEL:
# start = entity["start"] + offset
# end = entity["end"] + offset
# entity_text = entity["entity_with_link"]
# entity_group = entity["entity_group"]
#
# color = entity_colors.get(entity_group, "#dbeafe") # Default
# darker_color = "#008080"
#
# if "https:" in entity_text:
# text_with_links = text_with_links[
# :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[
# end:]
# offset += len(
# f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
# end - start)
# # text_with_links = text_with_links[:start] + f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>' + text_with_links[end:]
# # offset += len(
# # f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>') - (
# # end - start)
# #
# # text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
# # offset += len(entity_text) - (end - start)
# else:
# text_with_links = text_with_links[
# :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[end:]
# offset += len(
# f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
# end - start)
# # text_with_links = text_with_links[
# # :start] + f'<span style="background-color: {color}">{entity_text}</span>' + text_with_links[
# # end:]
# # offset += len(
# # f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
# Update state with the DataFrame
state["df_annotated_combined_dict"] = df_annotated_combined.to_dict()
if 'ALLURIScontext' in df_annotated_combined.columns:
# words_for_dropdown = df_annotated_combined[
# df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
# 'word'].unique().tolist()
                words_for_dropdown = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
                    lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]['word'].unique().tolist()
words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
words_for_dropdown.insert(0, "")
else:
words_for_dropdown = []
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
state["html_output"] = html_output
#return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
else:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
else:
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
state["html_output"] = html_output
return {"text": text, "entities": []}, html_output, state, [], ""
# "FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1"
def update_urls(selected_word, state):
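    # Gradio callback for the "Annotated Concepts" dropdown: rebuilds the annotated DataFrame
    # from state and returns the HTML list of candidate URIs for the selected word.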
if "df_annotated_combined_dict" in state:
# Convert the state dictionary back into a DataFrame
df = pd.DataFrame(state["df_annotated_combined_dict"])
if 'ALLURIScontext' in df.columns:
# # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
# valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
# # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
            valid_entries = df[df['ALLURIScontext'].apply(
                lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]
# Check if the selected word is in the filtered DataFrame
if selected_word in valid_entries['word'].values:
urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
if 'namedEntity' in df.columns:
firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
if firsturlinlist and firsturlinlist in urls:
# Remove the URL from its current position
urls.remove(firsturlinlist)
# Insert the URL at the first position
urls.insert(0, firsturlinlist)
# Convert list of URLs to HTML string with clickable links
#html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
return html_links
return ""
else:
return""
else:
return ""
# demo = gr.Interface(
# fn=nerBio,
# inputs=[
# gr.Textbox(label= "Input text", placeholder="Enter text here..."),
# gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List),
# gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List),
# gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7),
# gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False), #True False
# #gr.CheckboxGroup(POSSIBLE_KGchoices_List, label="KGchoices Selection", value=POSSIBLE_KGchoices_List, visible=True),
# gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List),
# gr.State(value={})
# ],
# outputs=[
# gr.HighlightedText(label="Annotated Text"),
# gr.HTML(label="Linked Text", show_label=True, visible=True), # use gr.HTML to render the annotated text with links , visible
# gr.State(),
# gr.Dropdown(label="Annotated Concepts", interactive=True,visible=True),
# gr.Textbox(label="Linked Entities",interactive=False,visible=True)
# ],
# live=True,
# title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
# description="""Interoperability β the capability of systems and organisations to cooperate across functional, sectoral and physical borders β is key for successful digital transformation.
# The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
# Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
#
# In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
# """,
# examples=examples,
# cache_examples=False,
# article="""
# **Categories Legend:**
# - MED | Medical
# - LOC | Locations
# - PER | Persons
# - ORG | Organizations
# - MISC | Miscellanea
# - CONC | Concepts & Ideas
# - BIOP | Biological
# - ACTI | Activities & Behaviors
# - ANAT | Anatomy
# - CHEM | Chemicals & Drugs
# - DEVI | Devices
# - DISO | Disorders
# - GENE | Genes & Molecular Sequences
# - GEOG | Geographic Areas
# - LIVB | Living Beings
# - OBJC | Objects
# - OCCU | Occupations
# - ORGA | Organizations
# - PHEN | Phenomena
# - PHYS | Physiology
# - PROC | Procedures
# """
# )
# Define the Gradio interface using Blocks
#description="This application performs biomedical named-entity recognition and linking."
with gr.Blocks(title="BioAnnotator") as demo:
gr.Markdown("# BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)")
gr.Markdown("""
This application performs biomedical named-entity recognition and linking.
**Description:**
*Interoperability* – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
This demo presents *BioAnnotator*, a prototype tool for Biomedical Named-Entity Recognition (NER) and Linking (NEL). To try it, select one or more NER models and enter some text to process. Also choose the entity categories to extract and the score threshold for the NER extraction. Finally, decide whether to perform Named-Entity Linking (NEL) and whether to restrict linking to specific biomedical ontologies (acronym descriptions at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
""")
with gr.Row():
with gr.Column():
text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.05, label="Score", value=0.75)
nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
state = gr.State(value={})
with gr.Column():
annotated_text = gr.HighlightedText(label="Annotated Text")
linked_text = gr.HTML(label="Linked Text", show_label=True, visible=True)
word_dropdown = gr.Dropdown(label="Annotated Concepts", show_label=True, visible=True, interactive=True)
urls_html = gr.HTML(label="Linked Entities", show_label=True, visible=True)
## Define the interactions
#text_input.change(fn=nerBio, inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection, state], outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
# Define the interactions for all inputs
inputs = [text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection]
for input_component in inputs:
input_component.change(fn=nerBio,
inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
kgchoices_selection, state],
outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
word_dropdown.change(fn=update_urls, inputs=[word_dropdown, state], outputs=urls_html)
# Add examples
gr.Examples(examples=examples,
inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
kgchoices_selection])
gr.Markdown("""
**Categories Legend:**
- MED | Medical
- LOC | Locations
- PER | Persons
- ORG | Organizations
- MISC | Miscellanea
- CONC | Concepts & Ideas
- BIOP | Biological
- ACTI | Activities & Behaviors
- ANAT | Anatomy
- CHEM | Chemicals & Drugs
- DEVI | Devices
- DISO | Disorders
- GENE | Genes & Molecular Sequences
- GEOG | Geographic Areas
- LIVB | Living Beings
- OBJC | Objects
- OCCU | Occupations
- ORGA | Organizations
- PHEN | Phenomena
- PHYS | Physiology
- PROC | Procedures
""")
demo.launch()
#demo.launch(share=True) # Share your demo with just 1 extra parameter