Consoli Sergio committed
Commit 5a9842d · 1 Parent(s): 193f79d

other sync changes
Files changed (2)
  1. app-demo-myMultiNER.py +20 -12
  2. nerBio.py +16 -12
app-demo-myMultiNER.py CHANGED
@@ -1,13 +1,13 @@
 import os
 
-#os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
+# os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
 #
-#os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
-#os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
-#os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+# os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+# os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+# os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
 
 from transformers import file_utils
-#print(file_utils.default_cache_path)
+print(file_utils.default_cache_path)
 
 import pandas as pd
 from tqdm import tqdm
@@ -19,12 +19,12 @@ from collections import Counter
 from transformers import pipeline, AutoTokenizer
 
 #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
 #import html
 
 import torch
-#torch.cuda.empty_cache() # Clear cache ot torch
+torch.cuda.empty_cache() # Clear cache ot torch
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Device: {device}...")
@@ -48,6 +48,14 @@ from nerBio import annotate, entitiesFusion, is_cross_inside, elinking
 from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc
 
 
+from joblib import Memory
+
+cachedir = 'cached'
+mem = Memory(cachedir, verbose=False)
+
+# this is to completely delete the cache:
+# mem.clear(warn=False)
+
 
 
 
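The hunk above introduces a joblib Memory disk cache. For readers unfamiliar with it: mem.cache wraps a function so that repeated calls with the same arguments are served from the cache directory instead of recomputed. A minimal sketch of the intended use (annotate_cached is a hypothetical name, not from the commit):

from joblib import Memory

mem = Memory('cached', verbose=False)

@mem.cache  # results are memoized on disk, keyed by the function arguments
def annotate_cached(text, model_name):
    # stand-in for an expensive NER call
    return len(text), model_name

annotate_cached("some text", "wikineural")  # computed and stored
annotate_cached("some text", "wikineural")  # served from the 'cached/' directory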
@@ -68,8 +76,8 @@ examples = [
 
 
 
-#models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
-models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
+models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
+#models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
 #models_List = ["NCBO/BioPortal" ]
 
 #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
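This hunk switches the active list back to all six models. For context: most entries are ordinary Hugging Face token-classification checkpoints, the urchade/gliner_* entries load through the gliner package imported in nerBio.py, and NCBO/BioPortal is an annotator web service rather than a local model. A loading sketch under those assumptions (not code from this commit):

from transformers import pipeline
from gliner import GLiNER

ner_pipe = pipeline("token-classification",
                    model="Babelscape/wikineural-multilingual-ner",
                    aggregation_strategy="simple")
gliner_model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")

print(ner_pipe("Sergio lives in Ispra."))
print(gliner_model.predict_entities("Sergio lives in Ispra.", ["person", "location"]))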
@@ -216,7 +224,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
 help="List of ontologies to which restrict the entity linking task.")
 #consose 20250502:
 if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
-parser.add_argument("--USE_CACHE", type=str, default="False",
+parser.add_argument("--USE_CACHE", type=str, default="True",
 help="whether to use cache for the NER and NEL tasks or not")
 else:
 #print("Lists do not have the same elements")
@@ -384,7 +392,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
 cache_map_geonames = {}
 
 key_geonames = ""
-if args.geonameskey_filename:
+if args.geonameskey_filename and os.path.exists(args.geonameskey_filename):
 fkeyname = args.geonameskey_filename
 with open(fkeyname) as f:
 key_geonames = f.read()
@@ -401,7 +409,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
 cache_map_virtuoso = {}
 
 key_virtuoso = ""
-if args.virtuosokey_filename:
+if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename):
 fkeyname = args.virtuosokey_filename
 with open(fkeyname) as f:
 key_virtuoso = f.read()
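The same guard (filename set and file present) is applied to every key-file read in this commit, in both files. If the pattern keeps spreading it could be factored into one helper; a sketch under that assumption (read_key is hypothetical, not in the commit):

import os

def read_key(path):
    # Returns the key file's contents, or "" when the path is unset or missing.
    if path and os.path.exists(path):
        with open(path) as f:
            return f.read()
    return ""

# usage, given the argparse namespace from the file:
# key_geonames = read_key(args.geonameskey_filename)
# key_virtuoso = read_key(args.virtuosokey_filename)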
nerBio.py CHANGED
@@ -1,10 +1,10 @@
 import os
 
-#os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" #GPUs to use
-#
-#os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
-#os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
-#os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" #GPUs to use
+
+os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
 
 from transformers import file_utils
 
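These environment variables are now active, and ordering matters: CUDA_VISIBLE_DEVICES and the Hugging Face cache variables are read when torch and transformers initialize, so they must be set before those imports, exactly as this file does. A minimal sketch of the rule (an assumed pattern matching the file's structure, not new commit code):

import os

# Set before importing torch/transformers; both read these at import or
# first-use time, so assigning them later has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,6"
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"

import torch                           # sees only the two listed GPUs
from transformers import file_utils
print(file_utils.default_cache_path)   # now resolves under HF_HOME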
@@ -21,10 +21,10 @@ from collections import Counter
 from gliner import GLiNER, GLiNERConfig, data_processing
 
 #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
 import torch
-#torch.cuda.empty_cache() # Clear cache ot torch
+torch.cuda.empty_cache() # Clear cache ot torch
 
 import logging
 
@@ -67,6 +67,10 @@ import numpy as np
 
 from retrieverRAG_testing import RAG_retrieval_Base, RAG_retrieval_Z_scores, RAG_retrieval_Percentile, RAG_retrieval_TopK
 
+from joblib import Memory
+
+cachedir = 'cached'
+mem = Memory(cachedir, verbose=False)
 
 # this is to completely delete the cache:
 # mem.clear(warn=False)
@@ -384,7 +388,7 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
 #https://bioportal.bioontology.org/annotatorplus
 
 key_bioportal = ""
-if args.bioportalkey_filename:
+if args.bioportalkey_filename and os.path.exists(args.bioportalkey_filename):
 fkeyname = args.bioportalkey_filename
 with open(fkeyname) as f:
 key_bioportal = f.read()
@@ -1200,7 +1204,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
 ALLURIScontext = []
 
 key_bioportal = ""
-if args.bioportalkey_filename:
+if args.bioportalkey_filename and os.path.exists(args.bioportalkey_filename):
 fkeyname = args.bioportalkey_filename
 with open(fkeyname) as f:
 key_bioportal = f.read()
@@ -2321,7 +2325,7 @@ if __name__ == '__main__':
 # cache_map_bioportal = {}
 #
 # key_bioportal = ""
-# if args.bioportalkey_filename:
+# if args.bioportalkey_filename and os.path.exists(args.bioportalkey_filename):
 # fkeyname = args.bioportalkey_filename
 # with open(fkeyname) as f:
 # key_bioportal = f.read()
@@ -2441,7 +2445,7 @@ if __name__ == '__main__':
 cache_map_geonames = {}
 
 key_geonames = ""
-if args.geonameskey_filename:
+if args.geonameskey_filename and os.path.exists(args.geonameskey_filename):
 fkeyname = args.geonameskey_filename
 with open(fkeyname) as f:
 key_geonames = f.read()
@@ -2458,7 +2462,7 @@ if __name__ == '__main__':
 cache_map_virtuoso = {}
 
 key_virtuoso = ""
-if args.virtuosokey_filename:
+if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename):
 fkeyname = args.virtuosokey_filename
 with open(fkeyname) as f:
 key_virtuoso = f.read()