Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import os
 
+
 from transformers import file_utils
 print(file_utils.default_cache_path)
 
@@ -16,11 +17,8 @@ from transformers.pipelines.pt_utils import KeyDataset
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from collections import Counter
 
-##os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-
 import torch
-
+torch.cuda.empty_cache()  # Clear cache of torch
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Device: {device}...")
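The commented-out PYTORCH_CUDA_ALLOC_CONF experiments are dropped in favour of an explicit torch.cuda.empty_cache() right after the import. That call releases unused blocks held by the CUDA caching allocator back to the driver, and it is a no-op when CUDA has never been initialized, so the line is harmless on CPU-only hosts.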
@@ -41,7 +39,7 @@ from virtuosoQueryRest import sparqlQuery
 import gradio as gr
 import re
 
-from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, rescale_exponential_to_logarithmic
+from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, strip_quotes, rescale_exponential_to_logarithmic
 
 
 
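The import now also pulls in strip_quotes, which the rest of this change uses to normalize quoted exact-match input. common.py is not part of this diff, so the helper below is only a minimal sketch of the behavior the new call sites appear to assume:

    def strip_quotes(s: str) -> str:
        # Assumed behavior (common.py is not shown in this diff):
        # remove one pair of surrounding quotes, if present.
        s = s.strip()
        if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
            return s[1:-1]
        return s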
@@ -73,16 +71,14 @@ modelGlinerBio=None
 num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner !
 tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
 
-
-
-
-
-
-'SYMP', 'FoodOn', 'UBERON', 'VO', 'EuroSciVoc']
+POSSIBLE_KGchoices_List = ["AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
+                           "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
+                           "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
+                           "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
+                           "SYMP", "FoodOn", "UBERON", "VO", "EuroSciVoc"]
 
 ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO"]
 
-
 encod = encoding_getter('microsoft/deberta-v3-large')
 text_splitter = TokenTextSplitter(
     # separators=separators,
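POSSIBLE_KGchoices_List is the vocabulary offered in the demo UI, while ONLY_Ontologies_OnBIOPORTAL lists the acronyms the BioPortal Annotator actually accepts. The if/elif chains added further down translate between the two spellings: SNOMED → SNOMEDCT, RO → OBOREL, TRANS → PTRANS, FoodOn → FOODON, GeoSPARQL → GEOSPARQL, NCBITaxon_ → NCBITAXON.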
@@ -215,23 +211,24 @@ def process_row_BioPortal_api(args, key_bioportal, row):
 
     onto_clauses = ""
     for choice in args.KG_restriction:
-        if choice
+        if choice == "SNOMED":
            choice="SNOMEDCT"
-        elif choice
+        elif choice == "RO":
            choice = "OBOREL"
-        elif choice
+        elif choice == "TRANS":
            choice = "PTRANS"
-        elif choice
+        elif choice == "FoodOn":
            choice = "FOODON"
-        elif choice
+        elif choice == "GeoSPARQL":
            choice = "GEOSPARQL"
-        # elif choice
+        # elif choice == "NCBITAXON":
        #     choice = "NCBITAXON,NCBITaxon_"
-        elif choice
+        elif choice == "NCBITaxon_":
            choice = "NCBITAXON"
        if choice in ONLY_Ontologies_OnBIOPORTAL:
            onto_clauses=onto_clauses+choice+","
 
+
    if onto_clauses and onto_clauses[-1] == ",":
        onto_clauses=onto_clauses[:-1]
 
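The chain above is a fixed renaming followed by a membership filter, so it can also be read as a lookup table. A compact sketch (illustration only, not the committed code) that yields the same comma-joined onto_clauses string without the trailing-comma trim:

    UI_TO_BIOPORTAL = {
        "SNOMED": "SNOMEDCT",
        "RO": "OBOREL",
        "TRANS": "PTRANS",
        "FoodOn": "FOODON",
        "GeoSPARQL": "GEOSPARQL",
        "NCBITaxon_": "NCBITAXON",
    }

    def build_onto_clauses(kg_restriction, allowed):
        # Map each UI choice to its BioPortal acronym and keep only accepted ones.
        mapped = (UI_TO_BIOPORTAL.get(choice, choice) for choice in kg_restriction)
        return ",".join(choice for choice in mapped if choice in allowed)

    # e.g. build_onto_clauses(args.KG_restriction, ONLY_Ontologies_OnBIOPORTAL)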
@@ -366,7 +363,7 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
     # with open(fkeyname) as f:
     #     key_bioportal = f.read()
     key_bioportal = os.environ['key_bioportal']
-
+
     df_annot = pd.DataFrame()
     for drm_idx, row in tqdm(df.iterrows()):
         df_BioPortalAnnotation=process_row_BioPortal_api(args, key_bioportal, row)
@@ -403,9 +400,9 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
 
     if "semantic_groups" not in df_max_score_biop.columns:
         # Drop the '@id' column
-        df_max_score_biop["semantic_groups"] = None
-
-
+        df_max_score_biop["semantic_groups"] = None
+
+    # Specify the columns you want to keep
     columns_to_keep = ["score", "from", "to", "prefLabel", "text", "semantic_groups", "@id", "ALLURIScontextFromNCBO"]
 
     # Subset the dataframe to keep only the specified columns
@@ -744,6 +741,21 @@ def entitiesFusion(df_annotated, args):
             logging.error(
                 f'FAILED to extract json results\n\tError: {err}\nLeaving it as a single column then and not decompressing! Have a check...')
 
+    #
+
+
+    # Delete all the rows where EXACT MATCHING NOT MET:
+    # Apply the conditions
+    condition_to_delete = (
+        df_annotated['ContextToAnnotate'].str.startswith('"') &
+        df_annotated['ContextToAnnotate'].str.endswith('"') &
+        (df_annotated['ContextToAnnotate'].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
+    )
+
+    # Now Filter out the rows where condition_to_delete is True
+    df_annotated = df_annotated[~condition_to_delete].copy()
+    #
+
     #delete all the rows with score smaller than entities_filter_threshold:
     if args.entities_filter_threshold > 0:
         df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
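The new filter enforces exact matching for quoted queries: a row survives only if its quoted ContextToAnnotate, stripped of quotes, equals the extracted word case-insensitively; unquoted rows are untouched. A self-contained toy run of the same condition (with a one-line stand-in for strip_quotes):

    import pandas as pd

    df = pd.DataFrame({
        "ContextToAnnotate": ['"aspirin"', '"aspirin"', 'plain sentence'],
        "word": ["aspirin", "ibuprofen", "ibuprofen"],
    })

    def strip_quotes(s):
        return s[1:-1] if len(s) >= 2 and s[0] == s[-1] == '"' else s

    drop = (
        df["ContextToAnnotate"].str.startswith('"')
        & df["ContextToAnnotate"].str.endswith('"')
        & (df["ContextToAnnotate"].apply(strip_quotes).str.lower() != df["word"].str.lower())
    )
    print(df[~drop])  # keeps the exact match and the unquoted row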
@@ -754,8 +766,8 @@ def entitiesFusion(df_annotated, args):
     # df_annotated = df_annotated[(df_annotated['ToLink'] == df_annotated['word']) | df_annotated['ToLink'].isna()]
 
     # in all the rows having a value not null for the column "ToLink", compare this value to that of the column "word". If they are different, set the value in "ToLink" to None
-    #df_annotated.loc[
-    #
+    # df_annotated.loc[
+    #     (~df_annotated['ToLink'].isnull()) & (df_annotated['ToLink'] != df_annotated['word']), 'ToLink'] = None
     df_annotated.loc[
         (~df_annotated['ToLink'].isnull()) & (
                 df_annotated['ToLink'].str.casefold() != df_annotated['word'].str.casefold()), 'ToLink'] = None
@@ -931,7 +943,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
     # with open(fkeyname) as f:
     #     key_bioportal = f.read()
     key_bioportal = os.environ['key_bioportal']
-
+
     # Check if args.KG_restriction exists and is not empty
     if getattr(args, 'KG_restriction', None):
 
@@ -961,24 +973,37 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
 
         ### this is for Bioportal url api:
         onto_clauses = ""
-
+        # for choice in args.KG_restriction:
+        #     if choice == "SNOMEDCT":
+        #         choice = "SNOMED"
+        #     elif choice == "OBOREL":
+        #         choice = "RO"
+        #     elif choice == "PTRANS":
+        #         choice = "TRANS"
+        #     elif choice == "FOODON":
+        #         choice = "FoodOn"
+        #     elif choice == "GEOSPARQL":
+        #         choice = "GeoSPARQL"
+        #     elif choice == "NCBITAXON":
+        #         choice = "NCBITAXON,NCBITaxon_"
+        #     onto_clauses = onto_clauses + choice + ","
         for choice in args.KG_restriction:
-            if choice
+            if choice == "SNOMED":
                choice="SNOMEDCT"
-            elif choice
+            elif choice == "RO":
                choice = "OBOREL"
-            elif choice
+            elif choice == "TRANS":
                choice = "PTRANS"
-            elif choice
+            elif choice == "FoodOn":
                choice = "FOODON"
-            elif choice
+            elif choice == "GeoSPARQL":
                choice = "GEOSPARQL"
-            # elif choice
+            # elif choice == "NCBITAXON":
            #     choice = "NCBITAXON,NCBITaxon_"
-            elif choice
+            elif choice == "NCBITaxon_":
                choice = "NCBITAXON"
            if choice in ONLY_Ontologies_OnBIOPORTAL:
-                onto_clauses
+                onto_clauses=onto_clauses+choice+","
 
        if onto_clauses and onto_clauses[-1] == ",":
            onto_clauses = onto_clauses[:-1]
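This is the same UI-to-BioPortal normalization just added to process_row_BioPortal_api, with the superseded inverse mapping kept as a comment block; a shared lookup table like the one sketched earlier would let both call sites use a single definition.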
@@ -1286,12 +1311,13 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
     return contextText, map_query_input_output
 
 #@mem.cache
-def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None):
+def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None,UseBioportalForLinking=True):
 
     if strtobool(args.debug):
         print(f"\n----- Starting virtuoso_api_call for {word}")
 
     word = word.lower()
+    word = strip_quotes(word)
 
     endpoint = 'https://api-vast.jrc.service.ec.europa.eu/sparql'
     VirtuosoUsername = 'dba'
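Defaulting the new UseBioportalForLinking parameter to True keeps every pre-existing caller of virtuoso_api_call behaving exactly as before, while the Forced-entity path added below can pass False to retry without BioPortal. Applying strip_quotes(word) after the lower-casing means a quoted exact-match term is resolved by its bare text.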
@@ -1340,7 +1366,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
         else:
 
             try:
-                entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=
+                entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=UseBioportalForLinking )
                 if ALLURIScontext and isinstance(ALLURIScontext, list):
                     ALLURIScontext = list(set(ALLURIScontext))
             except Exception as err:
@@ -1352,7 +1378,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
 
                 return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
 
-
+
 
     if entityBioeUrl:
 
@@ -1520,7 +1546,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
                                             endpoint,
                                             VirtuosoUsername,
                                             contextWordVirtuoso,
-                                            UseBioportalForLinking=
+                                            UseBioportalForLinking=UseBioportalForLinking)
                 if ALLURIScontext and isinstance(ALLURIScontext, list):
                     ALLURIScontext = list(set(ALLURIScontext))
 
@@ -1538,7 +1564,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
                 # Print the error message to stderr
                 print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
                 # Exit the program with a non-zero status code (commonly used to indicate an error)
-
+                sys.exit(1)
 
             else:
 
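sys.exit(1) assumes sys is imported at module level; the import itself is outside the hunks shown here.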
@@ -1714,6 +1740,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
 
 
 
+
 def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
 
     result = None
@@ -1736,8 +1763,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
 
             result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
 
-
-        if row['IsBio'] == 1:
+        elif row['IsBio'] == 1:
 
             # Check if '@id' column exists in df_Extract
             iiid = None
@@ -1756,7 +1782,37 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
             if strtobool(args.debug):
                 print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
                 print(row[args.source_column])
-            result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO)
+            result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
+
+        else:
+            if row['model'] == "Forced":
+                # Check if '@id' column exists in df_Extract
+                iiid = None
+                # Check if the '@id' exists in the Series
+                if '@id' in row:
+                    # Check if the value is not None or NaN
+                    if row['@id'] is not None and not pd.isna(row['@id']):
+                        # Assign the value to the variable iiid
+                        iiid = row['@id']
+                iiiALLURIScontextFromNCBO = None
+                if 'ALLURIScontextFromNCBO' in row:
+                    if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
+                                                                                list):  # and not pd.isna(row['ALLURIScontextFromNCBO']):
+                        iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
+                        iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
+
+                if strtobool(args.debug):
+                    print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
+                    print(row[args.source_column])
+
+                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
+                    row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
+                    id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True)
+
+                if not result: #try annotation without bioportal
+                    result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
+                        row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
+                        id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False)
 
     else:
         if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
@@ -1780,7 +1836,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
                     iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
 
             result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
-                row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO)
+                row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True)
 
     return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
 
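The Forced branch above is the one place that retries: it first calls virtuoso_api_call with BioPortal enabled and, when that yields no result, calls it once more with UseBioportalForLinking=False. The idiom reduced to its shape (a hypothetical helper, not part of the commit):

    def link_with_fallback(linker, *args, **kwargs):
        # First attempt with BioPortal-backed linking, then retry without it.
        outcome = linker(*args, UseBioportalForLinking=True, **kwargs)
        if not outcome[0]:  # outcome[0] is `result` in virtuoso_api_call's return tuple
            outcome = linker(*args, UseBioportalForLinking=False, **kwargs)
        return outcome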
|
@@ -1889,9 +1945,9 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
1889 |
parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation") # 0 means use the GPU for Gliner !
|
1890 |
|
1891 |
parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
|
1892 |
-
parser.add_argument("--geonameskey_filename", type=str, default="", help="file location where it is stored the geonames api key")
|
1893 |
-
parser.add_argument("--virtuosokey_filename", type=str, default="", help="file location where it is stored the virtuoso endpoint dba pwd")
|
1894 |
-
parser.add_argument("--bioportalkey_filename", type=str, default="", help="file location where it is stored the NCBO BioPortal api key")
|
1895 |
|
1896 |
# consose 20250205:
|
1897 |
# KGchoices = None
|
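The three key-file arguments gain concrete default filenames, but they remain fallbacks: as the other hunks show, the demo reads its actual secrets from the environment (os.environ['key_bioportal'], os.environ['key_virtuoso']) rather than from these files.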
@@ -1910,7 +1966,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
     # parser.add_argument("--USE_CACHE", type=str, default="False",
     #                     help="whether to use cache for the NER and NEL tasks or not")
     parser.add_argument("--USE_CACHE", type=str, default="False", help="whether to use cache for the NER and NEL tasks or not")
-
+
     parser.add_argument("--num_cores_eLinking", type=int, default=1, help="parallel processing for the entity linking process")
 
     parser.add_argument("--computeEntityContext", type=str, default="False",
@@ -1926,7 +1982,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
 
     args = parser.parse_args()
 
-
+    df_ToAnnotate = pd.DataFrame()
 
     #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
     #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
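Initializing df_ToAnnotate immediately after argument parsing guarantees that the later `if df_ToAnnotate.empty:` check in the exact-matching block is always well-defined, whichever path the request takes.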
@@ -1998,21 +2054,24 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
         df_annotated = history.copy()
 
 
-
-
-    # filter now per models selection
-    df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
-    if df_annotated.empty:
-        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
-        return {"text": text, "entities": []}, html_output, history.to_dict()
+    quoted_text = text.startswith('"') & text.endswith('"')
+    if (not df_annotated.empty) or quoted_text:
 
-
-
-
-
-
-
+        if (not df_annotated.empty):
+            # filter now per models selection
+            df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
+            if df_annotated.empty and quoted_text==False:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, history.to_dict()
 
+        df_annotated_combined = pd.DataFrame()
+        if (not df_annotated.empty):
+            df_annotated_combined = entitiesFusion(df_annotated,args)
+            if df_annotated_combined.empty and quoted_text==False:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, history.to_dict()
+            else:
+                df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.99. to avoid the linking
 
 
     cache_prefix_fp = "LLMQUERYNER"
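Note that text.startswith('"') & text.endswith('"') applies bitwise & to two plain bools; it evaluates correctly, but `and` would be the idiomatic, short-circuiting spelling outside pandas expressions. The new gate also means an empty annotation set no longer returns early when the user submitted a quoted exact-match query.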
@@ -2063,6 +2122,60 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
         # key_virtuoso = f.read()
         key_virtuoso = os.environ['key_virtuoso']
 
+        # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
+
+        if df_ToAnnotate.empty:
+            df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
+
+        if "SentenceRef" not in df_ToAnnotate.columns:
+            df_ToAnnotate["SentenceRef"] = None
+            df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
+                                                             col != 'SentenceRef']]  # this moves it to the first position
+
+        df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
+        df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
+            df_ToAnnotate[args.source_column]).transform('min').astype(int)
+        df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
+
+        # Define the condition to find missing SentenceRefs
+        missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
+
+        # Define the condition to check if ContextToAnnotate starts and ends with quotes
+        quoted_context = df_ToAnnotate['ContextToAnnotate'].str.startswith('"') & df_ToAnnotate[
+            'ContextToAnnotate'].str.endswith('"')
+
+        # Combine both conditions
+        condition = missing_sentence_refs & quoted_context
+
+        # Select rows from df_ToAnnotate that meet the condition
+        rows_to_add = df_ToAnnotate[condition]
+
+        rows_to_add['model'] = "Forced"
+        rows_to_add['entity_group'] = "MISC"
+        rows_to_add['word'] = rows_to_add['ContextToAnnotate']
+        rows_to_add['word'] = rows_to_add['ContextToAnnotate'].apply(strip_quotes)
+        rows_to_add['score'] = 1.0
+        rows_to_add['start'] = int(1)
+        rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
+        rows_to_add['IsGeo'] = None
+        rows_to_add['IsBio'] = None
+        rows_to_add['IsCrossInside'] = 0.0
+
+        if df_annotated_combined.empty:
+            df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)
+
+        # Append these rows to df_annotated_combined
+        df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)
+
+        df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
+        df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)
+
+        df_annotated_combined = df_annotated_combined.sort_values(
+            by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
+            ascending=[True, True, True, True, False])
+
+        # Now df_annotated_combined contains the additional rows
+
         df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
                                                                                                                                 text_splitter, args, key_geonames,
                                                                                                                                 cache_map_geonames,
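Two small pandas caveats in this block: rows_to_add['word'] is assigned twice, and only the second, strip_quotes-based assignment survives; and since rows_to_add = df_ToAnnotate[condition] is a filtered slice, the in-place column assignments that follow are the classic SettingWithCopyWarning pattern. A variant with an explicit copy (a suggestion, not the committed code):

    rows_to_add = df_ToAnnotate[condition].copy()  # explicit copy avoids SettingWithCopyWarning
    rows_to_add['model'] = "Forced"
    rows_to_add['entity_group'] = "MISC"
    rows_to_add['word'] = rows_to_add['ContextToAnnotate'].apply(strip_quotes)
    rows_to_add['score'] = 1.0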
@@ -2255,5 +2368,5 @@ demo = gr.Interface(
 
 
 
-
-demo.launch(share=True) # Share your demo with just 1 extra parameter
+demo.launch()
+#demo.launch(share=True) # Share your demo with just 1 extra parameter
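On a hosted Space the app is served directly, so demo.launch() is enough; share=True only matters for opening a temporary public tunnel when the script runs outside Hugging Face.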
|