jattokatarratto committed on
Commit
ef18338
·
verified ·
1 Parent(s): d2850ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -33
app.py CHANGED
@@ -1,5 +1,11 @@
1
  import os
2
 
 
 
 
 
 
 
3
  from transformers import file_utils
4
  print(file_utils.default_cache_path)
5
 
@@ -21,10 +27,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
21
  from collections import Counter
22
 
23
  #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
24
- #os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
25
 
26
  import torch
27
- #torch.cuda.empty_cache() # Clear cache ot torch
28
 
29
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
30
  print(f"Device: {device}...")
@@ -496,12 +502,12 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
496
  #https://data.bioontology.org/documentation#nav_annotator
497
  #https://bioportal.bioontology.org/annotatorplus
498
 
499
- key_bioportal = ""
500
- if args.bioportalkey_filename:
501
- fkeyname = args.bioportalkey_filename
502
- with open(fkeyname) as f:
503
- key_bioportal = f.read()
504
- #key_bioportal = os.environ['key_bioportal']
505
 
506
  df_annot = pd.DataFrame()
507
  for drm_idx, row in tqdm(df.iterrows()):
@@ -886,9 +892,9 @@ def entitiesFusion(df_annotated, args):
886
  # Delete all the rows where EXACT MATCHING NOT MET:
887
  # Apply the conditions
888
  condition_to_delete = (
889
- df_annotated['ContextToAnnotate'].str.startswith('"') &
890
- df_annotated['ContextToAnnotate'].str.endswith('"') &
891
- (df_annotated['ContextToAnnotate'].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
892
  )
893
 
894
  # Now Filter out the rows where condition_to_delete is True
@@ -1076,12 +1082,12 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1076
  entityBioeUrl = None
1077
  ALLURIScontext = []
1078
 
1079
- key_bioportal = ""
1080
- if args.bioportalkey_filename:
1081
- fkeyname = args.bioportalkey_filename
1082
- with open(fkeyname) as f:
1083
- key_bioportal = f.read()
1084
- #key_bioportal = os.environ['key_bioportal']
1085
 
1086
  # Check if args.KG_restriction exists and is not empty
1087
  if getattr(args, 'KG_restriction', None):
@@ -2310,12 +2316,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2310
  else:
2311
  cache_map_geonames = {}
2312
 
2313
- key_geonames = ""
2314
- if args.geonameskey_filename:
2315
- fkeyname = args.geonameskey_filename
2316
- with open(fkeyname) as f:
2317
- key_geonames = f.read()
2318
- #key_geonames = os.environ['key_geonames']
2319
 
2320
  cache_map_virtuoso = None
2321
  if strtobool(args.USE_CACHE):
@@ -2326,12 +2332,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2326
  else:
2327
  cache_map_virtuoso = {}
2328
 
2329
- key_virtuoso = ""
2330
- if args.virtuosokey_filename:
2331
- fkeyname = args.virtuosokey_filename
2332
- with open(fkeyname) as f:
2333
- key_virtuoso = f.read()
2334
- #key_virtuoso = os.environ['key_virtuoso']
2335
 
2336
  # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
2337
 
@@ -2352,8 +2358,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2352
  missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
2353
 
2354
  # Define the condition to check if ContextToAnnotate starts and ends with quotes
2355
- quoted_context = df_ToAnnotate['ContextToAnnotate'].str.startswith('"') & df_ToAnnotate[
2356
- 'ContextToAnnotate'].str.endswith('"')
2357
 
2358
  # Combine both conditions
2359
  condition = missing_sentence_refs & quoted_context
@@ -2363,8 +2369,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2363
 
2364
  rows_to_add['model'] = "Forced"
2365
  rows_to_add['entity_group'] = "MISC"
2366
- rows_to_add['word'] = rows_to_add['ContextToAnnotate']
2367
- rows_to_add['word'] = rows_to_add['ContextToAnnotate'].apply(strip_quotes)
2368
  rows_to_add['score'] = 1.0
2369
  rows_to_add['start'] = int(1)
2370
  rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
 
1
  import os
2
 
3
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use GPUs 1,6 only
4
+
5
+ os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
6
+ os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
7
+ os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
8
+
9
  from transformers import file_utils
10
  print(file_utils.default_cache_path)
11
 
 
27
  from collections import Counter
28
 
29
  #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
30
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
31
 
32
  import torch
33
+ torch.cuda.empty_cache() # Clear cache of torch
34
 
35
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
36
  print(f"Device: {device}...")
 
502
  #https://data.bioontology.org/documentation#nav_annotator
503
  #https://bioportal.bioontology.org/annotatorplus
504
 
505
+ #key_bioportal = ""
506
+ #if args.bioportalkey_filename:
507
+ # fkeyname = args.bioportalkey_filename
508
+ # with open(fkeyname) as f:
509
+ # key_bioportal = f.read()
510
+ key_bioportal = os.environ['key_bioportal']
511
 
512
  df_annot = pd.DataFrame()
513
  for drm_idx, row in tqdm(df.iterrows()):
 
892
  # Delete all the rows where EXACT MATCHING NOT MET:
893
  # Apply the conditions
894
  condition_to_delete = (
895
+ df_annotated[args.source_column].str.startswith('"') &
896
+ df_annotated[args.source_column].str.endswith('"') &
897
+ (df_annotated[args.source_column].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
898
  )
899
 
900
  # Now Filter out the rows where condition_to_delete is True
 
1082
  entityBioeUrl = None
1083
  ALLURIScontext = []
1084
 
1085
+ #key_bioportal = ""
1086
+ #if args.bioportalkey_filename:
1087
+ # fkeyname = args.bioportalkey_filename
1088
+ # with open(fkeyname) as f:
1089
+ # key_bioportal = f.read()
1090
+ key_bioportal = os.environ['key_bioportal']
1091
 
1092
  # Check if args.KG_restriction exists and is not empty
1093
  if getattr(args, 'KG_restriction', None):
 
2316
  else:
2317
  cache_map_geonames = {}
2318
 
2319
+ #key_geonames = ""
2320
+ #if args.geonameskey_filename:
2321
+ # fkeyname = args.geonameskey_filename
2322
+ # with open(fkeyname) as f:
2323
+ # key_geonames = f.read()
2324
+ key_geonames = os.environ['key_geonames']
2325
 
2326
  cache_map_virtuoso = None
2327
  if strtobool(args.USE_CACHE):
 
2332
  else:
2333
  cache_map_virtuoso = {}
2334
 
2335
+ #key_virtuoso = ""
2336
+ #if args.virtuosokey_filename:
2337
+ # fkeyname = args.virtuosokey_filename
2338
+ # with open(fkeyname) as f:
2339
+ # key_virtuoso = f.read()
2340
+ key_virtuoso = os.environ['key_virtuoso']
2341
 
2342
  # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
2343
 
 
2358
  missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
2359
 
2360
  # Define the condition to check if ContextToAnnotate starts and ends with quotes
2361
+ quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
2362
+ args.source_column].str.endswith('"')
2363
 
2364
  # Combine both conditions
2365
  condition = missing_sentence_refs & quoted_context
 
2369
 
2370
  rows_to_add['model'] = "Forced"
2371
  rows_to_add['entity_group'] = "MISC"
2372
+ rows_to_add['word'] = rows_to_add[args.source_column]
2373
+ rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
2374
  rows_to_add['score'] = 1.0
2375
  rows_to_add['start'] = int(1)
2376
  rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)