jattokatarratto committed · verified
Commit: c3a2666 · Parent(s): f20ae79

Update app.py

Files changed (1):
  1. app.py +177 -64
app.py CHANGED
@@ -1,5 +1,6 @@
 import os
 
+
 from transformers import file_utils
 print(file_utils.default_cache_path)
 
@@ -16,11 +17,8 @@ from transformers.pipelines.pt_utils import KeyDataset
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from collections import Counter
 
-##os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-
 import torch
-#torch.cuda.empty_cache() # Clear cache of torch
+torch.cuda.empty_cache() # Clear cache of torch
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Device: {device}...")
@@ -41,7 +39,7 @@ from virtuosoQueryRest import sparqlQuery
 import gradio as gr
 import re
 
-from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, rescale_exponential_to_logarithmic
+from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, strip_quotes, rescale_exponential_to_logarithmic
 
 
 
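`strip_quotes` is newly imported from `common`, whose source is not part of this diff. A hypothetical stand-in consistent with how the function is used in the hunks below (the exact behavior is an assumption):

```python
def strip_quotes(s: str) -> str:
    """Hypothetical stand-in: drop one pair of surrounding quotes, if present."""
    s = s.strip()
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
        s = s[1:-1].strip()
    return s

print(strip_quotes('"Aspirin"'))  # -> Aspirin
```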
@@ -73,16 +71,14 @@ modelGlinerBio=None
 num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner !
 tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
 
-
-POSSIBLE_KGchoices_List = ['AEO', 'BFO', 'BIM', 'BCGO', 'CL', 'CHIRO', 'CHEBI', 'DCM', 'FMA', 'GO', 'GENO',
-                           'GeoSPARQL', 'HL7', 'DOID', 'HP', 'HP_O', 'IDO', 'IAO', 'ICD10', 'LOINC', 'MESH',
-                           'MONDO', 'NCIT', 'NCBITAXON', 'NCBITaxon_', 'NIFCELL', 'NIFSTD', 'GML', 'OBCS', 'OCHV', 'OHPI',
-                           'OPB', 'TRANS', 'PLOSTHES', 'RADLEX', 'RO', 'STY', 'SO', 'SNOMED', 'STATO',
-                           'SYMP', 'FoodOn', 'UBERON', 'VO', 'EuroSciVoc']
+POSSIBLE_KGchoices_List = ["AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
+                           "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
+                           "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
+                           "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
+                           "SYMP", "FoodOn", "UBERON", "VO", "EuroSciVoc"]
 
 ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO"]
 
-
 encod = encoding_getter('microsoft/deberta-v3-large')
 text_splitter = TokenTextSplitter(
     # separators=separators,
@@ -215,23 +211,24 @@ def process_row_BioPortal_api(args, key_bioportal, row):
 
     onto_clauses = ""
     for choice in args.KG_restriction:
-        if choice.upper() == "SNOMED":
+        if choice == "SNOMED":
            choice="SNOMEDCT"
-        elif choice.upper() == "RO":
+        elif choice == "RO":
            choice = "OBOREL"
-        elif choice.upper() == "TRANS":
+        elif choice == "TRANS":
            choice = "PTRANS"
-        elif choice.upper() == "FoodOn":
+        elif choice == "FoodOn":
            choice = "FOODON"
-        elif choice.upper() == "GeoSPARQL":
+        elif choice == "GeoSPARQL":
            choice = "GEOSPARQL"
-        # elif choice.upper() == "NCBITAXON":
+        # elif choice == "NCBITAXON":
         #    choice = "NCBITAXON,NCBITaxon_"
-        elif choice.upper() == "NCBITaxon_":
+        elif choice == "NCBITaxon_":
            choice = "NCBITAXON"
        if choice in ONLY_Ontologies_OnBIOPORTAL:
            onto_clauses=onto_clauses+choice+","
 
+
    if onto_clauses and onto_clauses[-1] == ",":
        onto_clauses=onto_clauses[:-1]
@@ -366,7 +363,7 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
     # with open(fkeyname) as f:
     #     key_bioportal = f.read()
     key_bioportal = os.environ['key_bioportal']
-
+
     df_annot = pd.DataFrame()
     for drm_idx, row in tqdm(df.iterrows()):
         df_BioPortalAnnotation=process_row_BioPortal_api(args, key_bioportal, row)
@@ -403,9 +400,9 @@
 
     if "semantic_groups" not in df_max_score_biop.columns:
         # Drop the '@id' column
-        df_max_score_biop["semantic_groups"] = None
-
-    # Specify the columns you want to keep
+        df_max_score_biop["semantic_groups"] = None
+
+    # Specify the columns you want to keep
     columns_to_keep = ["score", "from", "to", "prefLabel", "text", "semantic_groups", "@id", "ALLURIScontextFromNCBO"]
 
     # Subset the dataframe to keep only the specified columns
@@ -744,6 +741,21 @@ def entitiesFusion(df_annotated, args):
         logging.error(
             f'FAILED to extract json results\n\tError: {err}\nLeaving it as a single column then and not decompressing! Have a check...')
 
+    #
+
+
+    # Delete all the rows where EXACT MATCHING is not met:
+    # Apply the conditions
+    condition_to_delete = (
+        df_annotated['ContextToAnnotate'].str.startswith('"') &
+        df_annotated['ContextToAnnotate'].str.endswith('"') &
+        (df_annotated['ContextToAnnotate'].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
+    )
+
+    # Now filter out the rows where condition_to_delete is True
+    df_annotated = df_annotated[~condition_to_delete].copy()
+    #
+
     #delete all the rows with score smaller than entities_filter_threshold:
     if args.entities_filter_threshold > 0:
         df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
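The new exact-matching filter keeps a quoted input only when the quoted string equals the recognized `word` case-insensitively. A self-contained illustration of the same logic, approximating `strip_quotes` with pandas' `str.strip('"')`:

```python
import pandas as pd

df = pd.DataFrame({
    "ContextToAnnotate": ['"aspirin"', '"aspirin"', 'no quotes here'],
    "word": ["Aspirin", "ibuprofen", "aspirin"],
})

quoted = df["ContextToAnnotate"].str.startswith('"') & df["ContextToAnnotate"].str.endswith('"')
mismatch = df["ContextToAnnotate"].str.strip('"').str.lower() != df["word"].str.lower()

# Drop quoted rows whose quoted text does not match the extracted word.
df = df[~(quoted & mismatch)].copy()
print(df)  # the '"aspirin"' vs 'ibuprofen' row is removed; the other two survive
```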
@@ -754,8 +766,8 @@ def entitiesFusion(df_annotated, args):
     # df_annotated = df_annotated[(df_annotated['ToLink'] == df_annotated['word']) | df_annotated['ToLink'].isna()]
 
     # in all the rows having a value not null for the column "ToLink", compare this value to that of the column "word". If they are different, set the value in "ToLink" to None
-    #df_annotated.loc[
-    #    (~df_annotated['ToLink'].isnull()) & (df_annotated['ToLink'] != df_annotated['word']), 'ToLink'] = None
+    # df_annotated.loc[
+    #     (~df_annotated['ToLink'].isnull()) & (df_annotated['ToLink'] != df_annotated['word']), 'ToLink'] = None
     df_annotated.loc[
         (~df_annotated['ToLink'].isnull()) & (
                 df_annotated['ToLink'].str.casefold() != df_annotated['word'].str.casefold()), 'ToLink'] = None
@@ -931,7 +943,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
     # with open(fkeyname) as f:
     #     key_bioportal = f.read()
     key_bioportal = os.environ['key_bioportal']
-
+
     # Check if args.KG_restriction exists and is not empty
     if getattr(args, 'KG_restriction', None):
 
@@ -961,24 +973,37 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
 
        ### this is for Bioportal url api:
        onto_clauses = ""
-
+        # for choice in args.KG_restriction:
+        #     if choice == "SNOMEDCT":
+        #         choice = "SNOMED"
+        #     elif choice == "OBOREL":
+        #         choice = "RO"
+        #     elif choice == "PTRANS":
+        #         choice = "TRANS"
+        #     elif choice == "FOODON":
+        #         choice = "FoodOn"
+        #     elif choice == "GEOSPARQL":
+        #         choice = "GeoSPARQL"
+        #     elif choice == "NCBITAXON":
+        #         choice = "NCBITAXON,NCBITaxon_"
+        #     onto_clauses = onto_clauses + choice + ","
        for choice in args.KG_restriction:
-            if choice.upper() == "SNOMED":
+            if choice == "SNOMED":
                choice="SNOMEDCT"
-            elif choice.upper() == "RO":
+            elif choice == "RO":
                choice = "OBOREL"
-            elif choice.upper() == "TRANS":
+            elif choice == "TRANS":
                choice = "PTRANS"
-            elif choice.upper() == "FoodOn":
+            elif choice == "FoodOn":
                choice = "FOODON"
-            elif choice.upper() == "GeoSPARQL":
+            elif choice == "GeoSPARQL":
                choice = "GEOSPARQL"
-            # elif choice.upper() == "NCBITAXON":
+            # elif choice == "NCBITAXON":
            #    choice = "NCBITAXON,NCBITaxon_"
-            elif choice.upper() == "NCBITaxon_":
+            elif choice == "NCBITaxon_":
                choice = "NCBITAXON"
            if choice in ONLY_Ontologies_OnBIOPORTAL:
-                onto_clauses = onto_clauses + choice + ","
+                onto_clauses=onto_clauses+choice+","
 
        if onto_clauses and onto_clauses[-1] == ",":
            onto_clauses = onto_clauses[:-1]
@@ -1286,12 +1311,13 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
     return contextText, map_query_input_output
 
 #@mem.cache
-def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None):
+def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None, UseBioportalForLinking=True):
 
     if strtobool(args.debug):
         print(f"\n----- Starting virtuoso_api_call for {word}")
 
     word = word.lower()
+    word = strip_quotes(word)
 
     endpoint = 'https://api-vast.jrc.service.ec.europa.eu/sparql'
     VirtuosoUsername = 'dba'
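Lower-casing and quote-stripping now happen before any cache lookup, which keeps cache keys canonical; a one-line sketch of the idea (inlining the quote-stripping for self-containment):

```python
def cache_key(word: str) -> str:
    """'"Aspirin"', 'ASPIRIN' and 'aspirin' all map to the same cache entry."""
    return word.lower().strip().strip('"').strip()

print(cache_key('"Aspirin"') == cache_key("aspirin"))  # True
```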
@@ -1340,7 +1366,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
     else:
 
         try:
-            entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=True )
+            entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=UseBioportalForLinking)
             if ALLURIScontext and isinstance(ALLURIScontext, list):
                 ALLURIScontext = list(set(ALLURIScontext))
         except Exception as err:
@@ -1352,7 +1378,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
 
             return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
 
-
+
 
     if entityBioeUrl:
 
@@ -1520,7 +1546,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
                 endpoint,
                 VirtuosoUsername,
                 contextWordVirtuoso,
-                UseBioportalForLinking=True)
+                UseBioportalForLinking=UseBioportalForLinking)
             if ALLURIScontext and isinstance(ALLURIScontext, list):
                 ALLURIScontext = list(set(ALLURIScontext))
 
@@ -1538,7 +1564,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
         # Print the error message to stderr
         print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
         # Exit the program with a non-zero status code (commonly used to indicate an error)
-
+        sys.exit(1)
 
     else:
 
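The new `sys.exit(1)` assumes `sys` is already imported at the top of app.py (the import is not visible in this diff); if it is not, this line would raise `NameError` instead of exiting cleanly:

```python
import sys  # must be imported once at module level for sys.exit to work

sys.exit(1)  # a non-zero exit status signals failure to the calling process
```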
@@ -1714,6 +1740,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
 
 
 
+
 def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
 
     result = None
@@ -1736,8 +1763,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
 
         result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
 
-    else:
-        if row['IsBio'] == 1:
+    elif row['IsBio'] == 1:
 
             # Check if '@id' column exists in df_Extract
             iiid = None
@@ -1756,7 +1782,37 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
             if strtobool(args.debug):
                 print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
                 print(row[args.source_column])
-            result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO)
+            result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
+
+    else:
+        if row['model'] == "Forced":
+            # Check if '@id' column exists in df_Extract
+            iiid = None
+            # Check if the '@id' exists in the Series
+            if '@id' in row:
+                # Check if the value is not None or NaN
+                if row['@id'] is not None and not pd.isna(row['@id']):
+                    # Assign the value to the variable iiid
+                    iiid = row['@id']
+            iiiALLURIScontextFromNCBO = None
+            if 'ALLURIScontextFromNCBO' in row:
+                if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
+                                                                            list):  # and not pd.isna(row['ALLURIScontextFromNCBO']):
+                    iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
+                    iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
+
+            if strtobool(args.debug):
+                print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
+                print(row[args.source_column])
+
+            result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
+                row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
+                id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
+
+            if not result:  # try annotation without bioportal
+                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
+                    row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
+                    id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False)
 
         else:
             if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
@@ -1780,7 +1836,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
                     iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
 
                 result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
-                    row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO)
+                    row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
 
     return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
 
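For rows forced in via quoted input, the commit first tries linking through BioPortal and, if that yields nothing, retries against the Virtuoso/SPARQL side alone. The retry pattern in isolation, with a hypothetical stand-in for the linker:

```python
from typing import Optional

def link(word: str, use_bioportal: bool) -> Optional[str]:
    """Stand-in for virtuoso_api_call: return a URI or None."""
    return None if use_bioportal else f"https://example.org/entity/{word}"

def link_with_fallback(word: str) -> Optional[str]:
    # Try the BioPortal-backed path first, then fall back to SPARQL-only.
    return link(word, use_bioportal=True) or link(word, use_bioportal=False)

print(link_with_fallback("aspirin"))  # https://example.org/entity/aspirin
```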
@@ -1889,9 +1945,9 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
     parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation")  # 0 means use the GPU for Gliner !
 
     parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
-    parser.add_argument("--geonameskey_filename", type=str, default="", help="file location where the geonames api key is stored")
-    parser.add_argument("--virtuosokey_filename", type=str, default="", help="file location where the virtuoso endpoint dba pwd is stored")
-    parser.add_argument("--bioportalkey_filename", type=str, default="", help="file location where the NCBO BioPortal api key is stored")
+    parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file location where the geonames api key is stored")
+    parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file location where the virtuoso endpoint dba pwd is stored")
+    parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file location where the NCBO BioPortal api key is stored")
 
     # consose 20250205:
     # KGchoices = None
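The key-file flags now default to concrete filenames, while the code paths in this diff actually read the secrets from environment variables (`os.environ['key_bioportal']`, `os.environ['key_virtuoso']`). A sketch of an env-first helper that falls back to the key file, mirroring the commented-out `with open(fkeyname)` pattern (the helper name is hypothetical):

```python
import os

def read_key(env_var: str, key_filename: str) -> str:
    """Prefer the environment variable (e.g. a Space secret); else read the key file."""
    if env_var in os.environ:
        return os.environ[env_var]
    with open(key_filename) as f:
        return f.read().strip()

# key_bioportal = read_key("key_bioportal", args.bioportalkey_filename)
```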
@@ -1910,7 +1966,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
     # parser.add_argument("--USE_CACHE", type=str, default="False",
     #                     help="whether to use cache for the NER and NEL tasks or not")
     parser.add_argument("--USE_CACHE", type=str, default="False", help="whether to use cache for the NER and NEL tasks or not")
-
+
     parser.add_argument("--num_cores_eLinking", type=int, default=1, help="parallel processing for the entity linking process")
 
     parser.add_argument("--computeEntityContext", type=str, default="False",
@@ -1926,7 +1982,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
 
     args = parser.parse_args()
 
-
+    df_ToAnnotate = pd.DataFrame()
 
     #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
     #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
@@ -1998,21 +2054,24 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
     df_annotated = history.copy()
 
 
-    if not df_annotated.empty:
-
-        # filter now per models selection
-        df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
-        if df_annotated.empty:
-            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
-            return {"text": text, "entities": []}, html_output, history.to_dict()
+    quoted_text = text.startswith('"') & text.endswith('"')
+    if (not df_annotated.empty) or quoted_text:
 
-        df_annotated_combined = entitiesFusion(df_annotated,args)
-        if df_annotated_combined.empty:
-            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
-            return {"text": text, "entities": []}, html_output, history.to_dict()
-        else:
-            df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.999 threshold to avoid the linking
+        if (not df_annotated.empty):
+            # filter now per models selection
+            df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
+            if df_annotated.empty and quoted_text==False:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, history.to_dict()
 
+        df_annotated_combined = pd.DataFrame()
+        if (not df_annotated.empty):
+            df_annotated_combined = entitiesFusion(df_annotated,args)
+            if df_annotated_combined.empty and quoted_text==False:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, history.to_dict()
+            else:
+                df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.999 threshold to avoid the linking
 
 
     cache_prefix_fp = "LLMQUERYNER"
@@ -2063,6 +2122,60 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
     #     key_virtuoso = f.read()
     key_virtuoso = os.environ['key_virtuoso']
 
+    # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add it to the dataframe:
+
+    if df_ToAnnotate.empty:
+        df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
+
+    if "SentenceRef" not in df_ToAnnotate.columns:
+        df_ToAnnotate["SentenceRef"] = None
+        df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
+                                                         col != 'SentenceRef']]  # this moves it to the first position
+
+    df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
+    df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
+        df_ToAnnotate[args.source_column]).transform('min').astype(int)
+    df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
+
+    # Define the condition to find missing SentenceRefs
+    missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
+
+    # Define the condition to check if ContextToAnnotate starts and ends with quotes
+    quoted_context = df_ToAnnotate['ContextToAnnotate'].str.startswith('"') & df_ToAnnotate[
+        'ContextToAnnotate'].str.endswith('"')
+
+    # Combine both conditions
+    condition = missing_sentence_refs & quoted_context
+
+    # Select rows from df_ToAnnotate that meet the condition
+    rows_to_add = df_ToAnnotate[condition]
+
+    rows_to_add['model'] = "Forced"
+    rows_to_add['entity_group'] = "MISC"
+    rows_to_add['word'] = rows_to_add['ContextToAnnotate']
+    rows_to_add['word'] = rows_to_add['ContextToAnnotate'].apply(strip_quotes)
+    rows_to_add['score'] = 1.0
+    rows_to_add['start'] = int(1)
+    rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
+    rows_to_add['IsGeo'] = None
+    rows_to_add['IsBio'] = None
+    rows_to_add['IsCrossInside'] = 0.0
+
+    if df_annotated_combined.empty:
+        df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)
+
+    # Append these rows to df_annotated_combined
+    df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)
+
+    df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
+    df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)
+
+    df_annotated_combined = df_annotated_combined.sort_values(
+        by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
+        ascending=[True, True, True, True, False])
+
+    # Now df_annotated_combined contains the additional rows
+
     df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
                                                                                                                              text_splitter, args, key_geonames,
                                                                                                                              cache_map_geonames,
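One pandas caveat in the new block: `rows_to_add = df_ToAnnotate[condition]` returns a slice, so the subsequent column assignments (`rows_to_add['model'] = "Forced"`, etc.) will emit `SettingWithCopyWarning`. Taking an explicit copy avoids that; a self-contained illustration:

```python
import pandas as pd

df_ToAnnotate = pd.DataFrame({"ContextToAnnotate": ['"aspirin"', 'plain text'],
                              "SentenceRef": [1, 2]})
condition = df_ToAnnotate['ContextToAnnotate'].str.startswith('"')

rows_to_add = df_ToAnnotate[condition].copy()  # .copy() silences SettingWithCopyWarning
rows_to_add['model'] = "Forced"
rows_to_add['word'] = rows_to_add['ContextToAnnotate'].str.strip('"')
rows_to_add['score'] = 1.0
print(rows_to_add)
```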
@@ -2255,5 +2368,5 @@ demo = gr.Interface(
 
 
 
-#demo.launch()
-demo.launch(share=True) # Share your demo with just 1 extra parameter
+demo.launch()
+#demo.launch(share=True) # Share your demo with just 1 extra parameter
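Flipping back to a plain `demo.launch()` makes sense for a hosted Space: the app is already served publicly there, and `share=True` only matters for tunnelling out of a local or notebook run. If one launcher must serve both cases, a small guard works (that `SPACE_ID` is set inside Spaces is an assumption worth verifying):

```python
import os

import gradio as gr

demo = gr.Interface(fn=lambda s: s, inputs="text", outputs="text")
# Tunnel only when not running on a Hugging Face Space (hypothetical guard).
demo.launch(share=not os.getenv("SPACE_ID"))
```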