Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,11 @@
|
|
1 |
import os
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from transformers import file_utils
|
4 |
print(file_utils.default_cache_path)
|
5 |
|
@@ -21,10 +27,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
21 |
from collections import Counter
|
22 |
|
23 |
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
24 |
-
|
25 |
|
26 |
import torch
|
27 |
-
|
28 |
|
29 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
30 |
print(f"Device: {device}...")
|
@@ -496,12 +502,12 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
|
|
496 |
#https://data.bioontology.org/documentation#nav_annotator
|
497 |
#https://bioportal.bioontology.org/annotatorplus
|
498 |
|
499 |
-
key_bioportal = ""
|
500 |
-
if args.bioportalkey_filename:
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
|
506 |
df_annot = pd.DataFrame()
|
507 |
for drm_idx, row in tqdm(df.iterrows()):
|
@@ -886,9 +892,9 @@ def entitiesFusion(df_annotated, args):
|
|
886 |
# Delete all the rows where EXACT MATCHING NOT MET:
|
887 |
# Apply the conditions
|
888 |
condition_to_delete = (
|
889 |
-
df_annotated[
|
890 |
-
df_annotated[
|
891 |
-
(df_annotated[
|
892 |
)
|
893 |
|
894 |
# Now Filter out the rows where condition_to_delete is True
|
@@ -1076,12 +1082,12 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
1076 |
entityBioeUrl = None
|
1077 |
ALLURIScontext = []
|
1078 |
|
1079 |
-
key_bioportal = ""
|
1080 |
-
if args.bioportalkey_filename:
|
1081 |
-
|
1082 |
-
|
1083 |
-
|
1084 |
-
|
1085 |
|
1086 |
# Check if args.KG_restriction exists and is not empty
|
1087 |
if getattr(args, 'KG_restriction', None):
|
@@ -2310,12 +2316,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2310 |
else:
|
2311 |
cache_map_geonames = {}
|
2312 |
|
2313 |
-
key_geonames = ""
|
2314 |
-
if args.geonameskey_filename:
|
2315 |
-
|
2316 |
-
|
2317 |
-
|
2318 |
-
|
2319 |
|
2320 |
cache_map_virtuoso = None
|
2321 |
if strtobool(args.USE_CACHE):
|
@@ -2326,12 +2332,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2326 |
else:
|
2327 |
cache_map_virtuoso = {}
|
2328 |
|
2329 |
-
key_virtuoso = ""
|
2330 |
-
if args.virtuosokey_filename:
|
2331 |
-
|
2332 |
-
|
2333 |
-
|
2334 |
-
|
2335 |
|
2336 |
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
2337 |
|
@@ -2352,8 +2358,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2352 |
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
2353 |
|
2354 |
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
2355 |
-
quoted_context = df_ToAnnotate[
|
2356 |
-
|
2357 |
|
2358 |
# Combine both conditions
|
2359 |
condition = missing_sentence_refs & quoted_context
|
@@ -2363,8 +2369,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
2363 |
|
2364 |
rows_to_add['model'] = "Forced"
|
2365 |
rows_to_add['entity_group'] = "MISC"
|
2366 |
-
rows_to_add['word'] = rows_to_add[
|
2367 |
-
rows_to_add['word'] = rows_to_add[
|
2368 |
rows_to_add['score'] = 1.0
|
2369 |
rows_to_add['start'] = int(1)
|
2370 |
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|
|
|
1 |
import os
|
2 |
|
3 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "1,6"  # to use the GPUs 1,6 only
|
4 |
+
|
5 |
+
os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
6 |
+
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
7 |
+
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
8 |
+
|
9 |
from transformers import file_utils
|
10 |
print(file_utils.default_cache_path)
|
11 |
|
|
|
27 |
from collections import Counter
|
28 |
|
29 |
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
30 |
+
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
31 |
|
32 |
import torch
|
33 |
+
torch.cuda.empty_cache()  # Clear cache of torch
|
34 |
|
35 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
36 |
print(f"Device: {device}...")
|
|
|
502 |
#https://data.bioontology.org/documentation#nav_annotator
|
503 |
#https://bioportal.bioontology.org/annotatorplus
|
504 |
|
505 |
+
#key_bioportal = ""
|
506 |
+
#if args.bioportalkey_filename:
|
507 |
+
# fkeyname = args.bioportalkey_filename
|
508 |
+
# with open(fkeyname) as f:
|
509 |
+
# key_bioportal = f.read()
|
510 |
+
key_bioportal = os.environ['key_bioportal']
|
511 |
|
512 |
df_annot = pd.DataFrame()
|
513 |
for drm_idx, row in tqdm(df.iterrows()):
|
|
|
892 |
# Delete all the rows where EXACT MATCHING NOT MET:
|
893 |
# Apply the conditions
|
894 |
condition_to_delete = (
|
895 |
+
df_annotated[args.source_column].str.startswith('"') &
|
896 |
+
df_annotated[args.source_column].str.endswith('"') &
|
897 |
+
(df_annotated[args.source_column].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
|
898 |
)
|
899 |
|
900 |
# Now Filter out the rows where condition_to_delete is True
|
|
|
1082 |
entityBioeUrl = None
|
1083 |
ALLURIScontext = []
|
1084 |
|
1085 |
+
#key_bioportal = ""
|
1086 |
+
#if args.bioportalkey_filename:
|
1087 |
+
# fkeyname = args.bioportalkey_filename
|
1088 |
+
# with open(fkeyname) as f:
|
1089 |
+
# key_bioportal = f.read()
|
1090 |
+
key_bioportal = os.environ['key_bioportal']
|
1091 |
|
1092 |
# Check if args.KG_restriction exists and is not empty
|
1093 |
if getattr(args, 'KG_restriction', None):
|
|
|
2316 |
else:
|
2317 |
cache_map_geonames = {}
|
2318 |
|
2319 |
+
#key_geonames = ""
|
2320 |
+
#if args.geonameskey_filename:
|
2321 |
+
# fkeyname = args.geonameskey_filename
|
2322 |
+
# with open(fkeyname) as f:
|
2323 |
+
# key_geonames = f.read()
|
2324 |
+
key_geonames = os.environ['key_geonames']
|
2325 |
|
2326 |
cache_map_virtuoso = None
|
2327 |
if strtobool(args.USE_CACHE):
|
|
|
2332 |
else:
|
2333 |
cache_map_virtuoso = {}
|
2334 |
|
2335 |
+
#key_virtuoso = ""
|
2336 |
+
#if args.virtuosokey_filename:
|
2337 |
+
# fkeyname = args.virtuosokey_filename
|
2338 |
+
# with open(fkeyname) as f:
|
2339 |
+
# key_virtuoso = f.read()
|
2340 |
+
key_virtuoso = os.environ['key_virtuoso']
|
2341 |
|
2342 |
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
2343 |
|
|
|
2358 |
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
2359 |
|
2360 |
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
2361 |
+
quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
|
2362 |
+
args.source_column].str.endswith('"')
|
2363 |
|
2364 |
# Combine both conditions
|
2365 |
condition = missing_sentence_refs & quoted_context
|
|
|
2369 |
|
2370 |
rows_to_add['model'] = "Forced"
|
2371 |
rows_to_add['entity_group'] = "MISC"
|
2372 |
+
rows_to_add['word'] = rows_to_add[args.source_column]
|
2373 |
+
rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
|
2374 |
rows_to_add['score'] = 1.0
|
2375 |
rows_to_add['start'] = int(1)
|
2376 |
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|