Consoli Sergio committed
Commit 232b620 · 1 Parent(s): 2800a02

Major commit: switch the interface to Gradio Blocks

Files changed (7)
  1. .gitignore +116 -0
  2. app-demo-myMultiNER.py +826 -0
  3. common.py +2 -1
  4. llmqueryNer.py +881 -0
  5. nerBio.py +0 -0
  6. retrieverRAG_testing.py +339 -0
  7. virtuosoQueryRest.py +14 -17
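
For orientation, the core of this change is moving from a single `gr.Interface` call (still visible, commented out, in app-demo-myMultiNER.py below) to an explicit `gr.Blocks` layout with hand-wired events. A minimal sketch of that migration pattern; the `analyze` function and component names here are illustrative placeholders, not the app's actual code:

import gradio as gr

def analyze(text: str) -> str:
    # stand-in for the real NER pipeline
    return text.upper()

# Before: gr.Interface wires inputs to outputs implicitly.
# demo = gr.Interface(fn=analyze, inputs=gr.Textbox(), outputs=gr.Textbox())

# After: gr.Blocks declares components explicitly and binds events by hand,
# which is what enables the dropdown/state interactions added in this commit.
with gr.Blocks(title="Demo") as demo:
    inp = gr.Textbox(label="Input text")
    out = gr.Textbox(label="Result")
    inp.change(fn=analyze, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()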
.gitignore ADDED
@@ -0,0 +1,116 @@
+
+
+
+.idea/
+
+config.py
+
+.gradio
+.gradio/
+.gradio/*
+
+#
+# Project specific excludes
+#
+
+*.log
+*.key
+*.env
+
+*.csv
+*.xlsx
+
+med_news.txt
+
+example_sergio.sh
+example_sergio_summary.sh
+
+screenlog.0
+
+/DONS/
+/UCI_ML_Repository/
+/cached/
+/DONS/*
+/UCI_ML_Repository/*
+/cached/*
+
+/prove/
+/prove/*
+
+*.json
+
+/__pycache__/
+/__pycache__/*
+
+/vast_api_logs/
+/vast_api_logs/*
+
+*.tpl
+
+
+./.settings/
+
+
+
+
+*.Rhistory
+*.Rproj
+*.RData
+
+tomcat
+
+#
+# Default excludes
+#
+
+# Binaries
+*.7z
+*.dmg
+*.gz
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+*.war
+*.ear
+*.sar
+*.class
+
+# Maven
+target/
+
+# IntelliJ project files
+*.iml
+*.iws
+*.ipr
+.idea/
+
+# eclipse project file
+.settings/
+.classpath
+.project
+
+# NetBeans specific
+nbproject/private/
+build/
+nbbuild/
+dist/
+nbdist/
+nbactions.xml
+nb-configuration.xml
+
+
+# OS
+.DS_Store
+
+# Misc
+*.swp
+release.properties
+pom.xml.releaseBackup
+pom.xml.tag
+__pycache__
+
+.Rproj.user
+
+/bin/
app-demo-myMultiNER.py ADDED
@@ -0,0 +1,826 @@
+import os
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,6"  # use GPUs 1 and 6 only
+
+os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+
+from transformers import file_utils
+print(file_utils.default_cache_path)
+
+import pandas as pd
+from tqdm import tqdm
+from gliner import GLiNER
+import logging
+from jinja2 import Template
+from collections import Counter
+
+from transformers import pipeline, AutoTokenizer
+
+#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
+#import html
+
+import torch
+torch.cuda.empty_cache()  # clear the torch CUDA cache
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Device: {device}...")
+if device.type == "cuda":
+    print("GPU number:", torch.cuda.current_device())
+
+import datasets
+
+import argparse
+import json
+import random
+import numpy as np
+
+import tiktoken
+from langchain.text_splitter import TokenTextSplitter
+
+import gradio as gr
+import re
+from common import strtobool, token_counter, encoding_getter, strip_quotes
+from nerBio import annotate, entitiesFusion, is_cross_inside, elinking
+from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc
+
+
+from joblib import Memory
+
+cachedir = 'cached'
+mem = Memory(cachedir, verbose=False)
+
+# this is to completely delete the cache:
+# mem.clear(warn=False)
+
+
+
+
+
+examples = [
+    ["He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
+    ["He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
+    ["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. ", None],
+    ["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. \"So far, the chikungunya disease is still under control nationwide,\" he told reporters after visiting Sultanah Nur Zahirah Hospital here. Present was Terengganu Health Director Dr. Nordiyanah Hassan. Liow said that so far, there is no specific medicine to treat the chikungunya fever disease spread by _Aedes_ mosquito. \"So, I would like to call on the public to be careful particularly during the wet season now because _Aedes_ mosquito is easy to breed,\" he said. To contain the spread of the disease, he said, the ministry had taken several measures including intensifying the campaign to rid of _Aedes_ mosquito and holding lectures on the outbreak. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. Meanwhile, he said 63 health projects costing RM458 million [USD 131 230 211] had been approved for implementation in Terengganu under the Ninth Malaysia Plan and some had started.", None],
+    ["Carcinoma", None],
+    ["The doctor diagnosed the patient with basal cell carcinoma, a common type of skin cancer.", None],
+    ["West Nile virus", None],
+    ["Legionellosis", None],
+    ["Eight years ago I started with Fosamax for 3-4 years and then took Actonel. In March, I decided not to take Actonel any longer. I had been on it for too long and was fearful of esophageal cancer and bone breakage. Now my doctor wants me to take the Prolia injections, which I am not going to do. I am not going to continue with any drugs. My bone density recently done was in the minuses. I do work with a personal trainer and execise daily. I am searching for alternative ways to deal with this problem.", None],
+    ["Does Chicago have any stores and does Joe live here?", None],
+    ["Cholera has been reported every week since November 1994. By 5 November 1995 at total of 12,344 with 245 deaths have been notified. Of these, 879 cases with 4 deaths were reported for the period 9 October to 5 November 1995. Control efforts have not succeeded in preventing the spread of the epidemic and when cases were detected on Sao Nicolau and Sal Islands in the period 9 October to 5 November all nine inhabited islands of Cap Verde had become infected. The last cholera epidemic in Cap Verde occurred in 1979. (See also Weekly Epidemiological Record No. 44, 3 November 1995) Côte d'Ivoire: A cholera outbreak which started in September 1995 caused 2,027 cases and 150 deaths up to 12 November 1995. The first cases were reported in Department de l'Ouest on 18 September 1995. Cases were subsequently reported in Department de Nord and most recently in Department du Centre and Department de Sud. The WHO Representative assisted in the organization of a team to visit the area and evaluate the situation as well as arranging for medical supplies. (1.12.95) Iran, Islamic Republic of,: Kordestan Province has been declared free of cholera. (1.12.95) Iraq: An outbreak of cholera reported from Sulaimaniyah Governorate in Northern Iraq has resulted in 519 cases, 264 of which have been confirmed, and 3 deaths to date. Vibrio cholerae O1 serotype Ogawa has been isolated. At the request of the Iraqi Ministry of Health, a WHO consultant has been sent to the area to assess and monitor the situation, provide guidance to the health authorities, and coordinate inputs by non-governmental organizations. WHO has also made available essential treatment supplies. An intensive media campaign to raise public awareness about essential preventive measures has been successful in containing the spread of the outbreak. (1.12.95) Senegal: Despite the fact that cholera has been endemic in countries bordering Senegal for the past two years, no cases were reported from Senegal until mid- August 1995. Between 15 August and 17 November 1995, 852 case and 43 deaths were notified. A further 731 cases with 37 deaths have been reported for the period 1 September to 12 November. Most cases were in the Departments of Dakar and Pikine in the Dakar Region and recently also Departments of Mbacke and Touba in Diourbel Region. ", None],
+]
+
+
+
+models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal"]  # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
+#models_List = ["Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "NCBO/BioPortal"]
+#models_List = ["NCBO/BioPortal"]
+
+#categories_List = ["MED", "LOC", "PER", "ORG", "DATE", "MISC"]
+categories_List = ["MED", "LOC", "PER", "ORG", "DATE", "MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS", "PROC"]
+
+POSSIBLE_KGchoices_List = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
+                           "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
+                           "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
+                           "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
+                           "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]
+
+
+modelGliner = None
+modelGlinerBio = None
+
+num_cores_Gliner_forDemo = 0  # 0 means use the GPU for Gliner!
+tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
+
+
+encod = encoding_getter('microsoft/deberta-v3-large')
+text_splitter = TokenTextSplitter(
+    # separators=separators,
+    encoding_name=encod.name,
+    chunk_size=80000,
+    chunk_overlap=50,
+    length_function=len,
+    add_start_index=True,
+)
+
+pipe_dict = {}
+for modelName in models_List:
+    tsk = "token-classification"
+    if ("/gliner" not in modelName) and ("NCBO" not in modelName):
+        pipe = pipeline(
+            tsk,
+            model=modelName,
+            aggregation_strategy="simple",
+            device=device,
+        )
+        pipe_dict[modelName] = pipe
+    elif "/gliner" in modelName:
+        if not tokenizerGliner:
+            tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
+        if "_bio-" in modelName:
+            if num_cores_Gliner_forDemo > 0:
+                modelGlinerBio = GLiNER.from_pretrained(modelName)  # "urchade/gliner_large_bio-v0.1"
+            else:
+                modelGlinerBio = GLiNER.from_pretrained(modelName, map_location=device)
+        else:
+            if num_cores_Gliner_forDemo > 0:
+                modelGliner = GLiNER.from_pretrained(
+                    modelName)  # "knowledgator/gliner-multitask-large-v0.5" - "urchade/gliner_large-v2.1"
+            else:
+                modelGliner = GLiNER.from_pretrained(modelName, map_location=device)
+
+
+#### GPT@JRC API
+#if args.service_provider == "gptjrc":
+key_gptjrc = ""
+fkeyname = "GPTJRC-APItoken.key"
+if os.path.exists(fkeyname):
+    with open(fkeyname) as f:
+        key_gptjrc = f.read()
+else:
+    key_gptjrc = os.environ['key_gptjrc']
+setup_gptjrc(key_gptjrc)
+#####
+
+
+# Add this function to handle dropdown selection
+def get_urls(word, df_annotated_combined):
+    # Filter the DataFrame to rows whose 'ALLURIScontext' is a non-empty list with non-blank content
+    valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
+        lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]
+
+    # Check if the word is in the filtered DataFrame
+    if word in valid_entries['word'].values:
+        urls = valid_entries.loc[valid_entries['word'] == word, 'ALLURIScontext'].values[0]
+
+        if 'namedEntity' in df_annotated_combined.columns:
+            firsturlinlist = df_annotated_combined.loc[df_annotated_combined['word'] == word, 'namedEntity']
+            firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
+            if firsturlinlist and firsturlinlist in urls:
+                # Remove the URL from its current position
+                urls.remove(firsturlinlist)
+                # Insert the URL at the first position
+                urls.insert(0, firsturlinlist)
+
+        html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
+        #html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
+        return html_links
+    return ""
+
+
+
+
+###@mem.cache
+def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):
+
+    if EntityLinking:
+        EnableNEL = "True"
+    else:
+        EnableNEL = "False"
+
+    if not text:
+        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+        state = {
+            "text": "",
+            "df_annotated_dict": dict(),
+            "KGchoices": KGchoices
+        }
+        return {"text": text, "entities": []}, html_output, state, [], ""
+
+    df_annotated = pd.DataFrame()
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_id", type=str, default=models_List[0], help="model to use")
+
+    parser.add_argument("--debug", type=str, default="True", help="set debug mode")
+
+    parser.add_argument("--source_column", type=str, default="ContextToAnnotate")
+
+    parser.add_argument("--entities_filter_threshold", type=float, default=ScoreFilt)
+
+    parser.add_argument("--SEED", type=int, default=41)
+    parser.add_argument("--batch_size", type=int, default=32)  # 4 - 8 - 16
+    parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation")  # 0 means use the GPU for Gliner!
+
+    parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
+    parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file location where the geonames api key is stored")
+    parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file location where the virtuoso endpoint dba pwd is stored")
+    parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file location where the NCBO BioPortal api key is stored")
+
+    # consose 20250205:
+    # KGchoices = None
+    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'NCIT']
+    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
+    if KGchoices:
+        KGchoices.sort()
+    parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
+                        help="List of ontologies to which restrict the entity linking task.")
+    #consose 20250502:
+    if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
+        parser.add_argument("--USE_CACHE", type=str, default="False",
+                            help="whether to use cache for the NER and NEL tasks or not")
+    else:
+        #print("Lists do not have the same elements")
+        parser.add_argument("--USE_CACHE", type=str, default="False",
+                            help="whether to use cache for the NER and NEL tasks or not")
+
+    parser.add_argument("--num_cores_eLinking", type=int, default=10, help="parallel processing for the entity linking process")
+
+    parser.add_argument("--computeEntityContext", type=str, default="False",
+                        help="whether to extract a readable context from the extracted triples for the concept")
+    parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
+                        help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
+    parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
+                        help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
+
+    parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
+    parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="llm to use")
+    parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")
+
+    parser.add_argument("--temperature", type=float, default=0.01)
+
+
+    args = parser.parse_args()
+
+    df_ToAnnotate = pd.DataFrame()
+
+    previous_text = ""
+    previous_df_annotated_dict = dict()
+    previous_kg_choices = []
+    if state:
+        previous_text = state.get("text", "")
+        previous_df_annotated_dict = state.get("df_annotated_dict", {})
+        previous_kg_choices = state.get("KGchoices", [])
+
+    #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
+    #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
+    #if (not history_dict) or (history_dict[args.source_column][0] != text):
+    if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices)):
+
+        for model_id in models_List:  # always do all the annotations, only filter them afterwards
+        #for model_id in ModelsSelection:
+
+            # if history_dict and (history_dict[args.source_column][0] == text):
+            #     if model_id in hhist['model'].unique():
+            #         continue
+
+            parser.set_defaults(model_id=model_id)
+
+            args = parser.parse_args()
+
+            print("ARGS:")
+            print(args)
+
+            # %% In machine learning tasks, particularly when dealing with models that have stochasticity involved (like text generation), it's important to set seeds for random number generators to ensure reproducibility of results. In the case of using models from the transformers library, you need to set seeds for Python's random module, NumPy, and PyTorch to ensure that the results are the same every time you run the code.
+            # Before you create the pipeline and run the text generation, set the seeds like this:
+            random.seed(args.SEED)
+            np.random.seed(args.SEED)
+            torch.manual_seed(args.SEED)
+            torch.cuda.manual_seed_all(args.SEED)
+            ###
+
+            df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
+
+            if "SentenceRef" not in df_ToAnnotate.columns:
+                df_ToAnnotate["SentenceRef"] = None
+                df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
+                                                                 col != 'SentenceRef']]  # this moves it to the first position
+
+            df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
+            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int)
+            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
+
+            # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            # if strtobool(args.debug):
+            #     print(f"Device: {device}...")
+            #     if device.type == "cuda":
+            #         print("GPU number:", torch.cuda.current_device())
+
+            pipeToUse = None
+            if ("gliner" not in args.model_id) and ("NCBO" not in args.model_id):
+                pipeToUse = pipe_dict[args.model_id]
+
+            new_annotations = annotate(df_ToAnnotate, args, pipeToUse, tokenizerGliner, modelGliner, modelGlinerBio, device)
+            if not new_annotations.empty:
+                if df_annotated.empty:
+                    # If df_annotated is empty, just assign new_annotations to it
+                    df_annotated = new_annotations
+                else:
+                    # If df_annotated is not empty, concatenate new_annotations to it
+                    df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)
+
+        state = {
+            "text": text,
+            "df_annotated_dict": df_annotated.to_dict(),
+            "KGchoices": KGchoices
+        }
+
+    else:
+
+        print("ARGS:")
+        print(args)
+
+        # %% In machine learning tasks, particularly when dealing with models that have stochasticity involved (like text generation), it's important to set seeds for random number generators to ensure reproducibility of results. In the case of using models from the transformers library, you need to set seeds for Python's random module, NumPy, and PyTorch to ensure that the results are the same every time you run the code.
+        # Before you create the pipeline and run the text generation, set the seeds like this:
+        random.seed(args.SEED)
+        np.random.seed(args.SEED)
+        torch.manual_seed(args.SEED)
+        torch.cuda.manual_seed_all(args.SEED)
+        ###
+
+        history = pd.DataFrame(previous_df_annotated_dict)
+        df_annotated = history.copy()
+
+        state = {
+            "text": text,
+            "df_annotated_dict": df_annotated.to_dict(),
+            "KGchoices": KGchoices
+        }
+
+
+    quoted_text = text.startswith('"') and text.endswith('"')
+    if (not df_annotated.empty) or quoted_text:
+
+        if not df_annotated.empty:
+            # filter now per models selection
+            df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
+            if df_annotated.empty and not quoted_text:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, state, [], ""
+
+        df_annotated_combined = pd.DataFrame()
+        if not df_annotated.empty:
+            df_annotated_combined = entitiesFusion(df_annotated, args)
+            if df_annotated_combined.empty and not quoted_text:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, state, [], ""
+            else:
+                if not df_annotated.empty:
+                    df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999)  # I cut all the cross-inside with the 0.999 to avoid the linking
+
+
+        cache_prefix_fp = "LLMQUERYNER"
+        cache_nameLLMs = cache_prefix_fp + "___" + "__".join(
+            [args.service_provider, args.model_name, str(args.temperature)]).replace(
+            " ", "_") + ".json"
+
+        load_map_query_input_output = None
+        if strtobool(args.USE_CACHE):
+            if os.path.exists(cache_nameLLMs):
+                with open(cache_nameLLMs) as f:
+                    load_map_query_input_output = json.load(f)
+            else:
+                load_map_query_input_output = {}
+
+        ### entity linking part:
+        if strtobool(args.entity_linking):
+
+            cache_map_geonames = None
+            if strtobool(args.USE_CACHE):
+                cache_filename = "CACHE_geonames.json"
+                if os.path.exists(cache_filename):
+                    with open(cache_filename) as f:
+                        cache_map_geonames = json.load(f)
+                else:
+                    cache_map_geonames = {}
+
+            key_geonames = ""
+            if args.geonameskey_filename:
+                fkeyname = args.geonameskey_filename
+                with open(fkeyname) as f:
+                    key_geonames = f.read()
+            else:
+                key_geonames = os.environ['key_geonames']
+
+            cache_map_virtuoso = None
+            if strtobool(args.USE_CACHE):
+                cacheVirtuoso_filename = "CACHE_virtuoso.json"
+                if os.path.exists(cacheVirtuoso_filename):
+                    with open(cacheVirtuoso_filename) as f:
+                        cache_map_virtuoso = json.load(f)
+                else:
+                    cache_map_virtuoso = {}
+
+            key_virtuoso = ""
+            if args.virtuosokey_filename:
+                fkeyname = args.virtuosokey_filename
+                with open(fkeyname) as f:
+                    key_virtuoso = f.read()
+            else:
+                key_virtuoso = os.environ['key_virtuoso']
+
+
+            # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add it to the dataframe:
+
+            if df_ToAnnotate.empty:
+                df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
+
+            if "SentenceRef" not in df_ToAnnotate.columns:
+                df_ToAnnotate["SentenceRef"] = None
+                df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
+                                                                 col != 'SentenceRef']]  # this moves it to the first position
+
+            df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
+            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
+                df_ToAnnotate[args.source_column]).transform('min').astype(int)
+            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
+
+            # Define the condition to find missing SentenceRefs
+            missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
+
+            # Define the condition to check if ContextToAnnotate starts and ends with quotes
+            quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
+                args.source_column].str.endswith('"')
+
+            # Combine both conditions
+            condition = missing_sentence_refs & quoted_context
+
+            # Select rows from df_ToAnnotate that meet the condition
+            rows_to_add = df_ToAnnotate[condition]
+
+            rows_to_add['model'] = "Forced"
+            rows_to_add['entity_group'] = "MISC"
+            rows_to_add['word'] = rows_to_add[args.source_column]
+            rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
+            rows_to_add['score'] = 1.0
+            rows_to_add['start'] = int(1)
+            rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
+            rows_to_add['IsGeo'] = None
+            rows_to_add['IsBio'] = None
+            rows_to_add['IsCrossInside'] = 0.0
+
+            if df_annotated_combined.empty:
+                df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)
+
+            # Append these rows to df_annotated_combined
+            df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)
+
+            df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
+            df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)
+
+            df_annotated_combined = df_annotated_combined.sort_values(
+                by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
+                ascending=[True, True, True, True, False])
+
+            # Now df_annotated_combined contains the additional rows
+
+            df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
+                                                                                                                                    text_splitter, args, key_geonames,
+                                                                                                                                    cache_map_geonames,
+                                                                                                                                    key_virtuoso,
+                                                                                                                                    cache_map_virtuoso,
+                                                                                                                                    load_map_query_input_output,
+                                                                                                                                    device)
+
+            if strtobool(args.USE_CACHE):
+                if cache_map_geonames_AFTER is not None:
+                    with open(cache_filename, "w") as f:
+                        json.dump(cache_map_geonames_AFTER, f)
+
+                if cache_map_virtuoso_AFTER is not None:
+                    with open(cacheVirtuoso_filename, "w") as f:
+                        json.dump(cache_map_virtuoso_AFTER, f)
+
+                if load_map_query_input_output_AFTER is not None:
+                    with open(cache_nameLLMs, "w") as f:
+                        json.dump(load_map_query_input_output_AFTER, f)
+
+        ### end entity linking part
+
+
+        ### filter by selected categories only
+        # #df_annotated_combined = df_annotated_combined[df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in CategoriesSelection])]
+        # if "MED" in CategoriesSelection:
+        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
+        #         [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
+        # else:
+        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
+        #         [cat.lower() for cat in CategoriesSelection])
+        # df_annotated_combined = df_annotated_combined[filter_mask]
+        #
+        # if "MED" in CategoriesSelection:
+        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
+        #         [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
+        # elif "OTHER" in CategoriesSelection:
+        #     filter_mask = ~(
+        #         df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
+        # else:
+        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
+        #         [cat.lower() for cat in CategoriesSelection])
+
+        filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
+            [cat.lower() for cat in CategoriesSelection])
+        if "MED" in CategoriesSelection:
+            filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & (df_annotated_combined['IsBio'] == 1)
+        if "MISC" in CategoriesSelection:
+            # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
+            filter_mask |= ~(
+                df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(
+                df_annotated_combined['IsBio'] == 1)  # with this clause, I'm including not only the categories labelled as MISC,
+                # but also the others that are not MED, PER, ORG, LOC
+
+        df_annotated_combined = df_annotated_combined[filter_mask]
+        if df_annotated_combined.empty:
+            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+            return {"text": text, "entities": []}, html_output, state, [], ""
+
+        ###
+
+        #df_annotated_combined = is_cross_inside(df_annotated_combined, args)
+
+        if 'IsCrossInside' in df_annotated_combined.columns:
+            df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
+            if df_annotated_combined.empty:
+                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+                return {"text": text, "entities": []}, html_output, state, [], ""
+
+        dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
+
+        ### continue linking part:
+        if strtobool(args.entity_linking):
+            # ##### this is to pass the links:
+
+            # Create a new column for the entities with links
+            df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
+                lambda row: (
+                    f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
+                    if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity'])
+                    else row['word']
+                ),
+                #lambda row: (
+                #    f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
+                #    if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity'])
+                #    else row['word']
+                #),
+                axis=1
+            )
+
+            # Create a new dictionary with the entity information and the link
+            dict_annotated_combined_NEL = df_annotated_combined[
+                ["end", "entity_group", "score", "start", "entity_with_link"]].to_dict(orient="records")
+
+            # Sort the entities by their start index
+            dict_annotated_combined_NEL.sort(key=lambda x: x['start'])
+
+            # Create a dictionary to map entity groups to colors
+            entity_colors = {
+                "MED": "#E6E6E6",
+                "PER": "#FFC0CB",
+                "ORG": "#C6F4D6",
+                "LOC": "#FFFFCC",
+                "MISC": "#F5DEB3"
+            }
+
+            text_with_links = text
+            offset = 0
+            for entity in dict_annotated_combined_NEL:
+                start = entity["start"] + offset
+                end = entity["end"] + offset
+                entity_text = entity["entity_with_link"]
+                text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
+                offset += len(entity_text) - (end - start)
+
+            # # Create the text with entities highlighted and linked
+            # text_with_links = text
+            # offset = 0
+            # for entity in dict_annotated_combined_NEL:
+            #     start = entity["start"] + offset
+            #     end = entity["end"] + offset
+            #     entity_text = entity["entity_with_link"]
+            #     entity_group = entity["entity_group"]
+            #
+            #     color = entity_colors.get(entity_group, "#dbeafe")  # Default
+            #     darker_color = "#008080"
+            #
+            #     if "https:" in entity_text:
+            #         text_with_links = text_with_links[
+            #             :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[
+            #             end:]
+            #         offset += len(
+            #             f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
+            #             end - start)
+            #         # text_with_links = text_with_links[:start] + f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>' + text_with_links[end:]
+            #         # offset += len(
+            #         #     f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>') - (
+            #         #     end - start)
+            #         #
+            #         # text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
+            #         # offset += len(entity_text) - (end - start)
+            #     else:
+            #         text_with_links = text_with_links[
+            #             :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[end:]
+            #         offset += len(
+            #             f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
+            #             end - start)
+            #         # text_with_links = text_with_links[
+            #         #     :start] + f'<span style="background-color: {color}">{entity_text}</span>' + text_with_links[
+            #         #     end:]
+            #         # offset += len(
+            #         #     f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)
+
+
+            # Update state with the DataFrame
+            state["df_annotated_combined_dict"] = df_annotated_combined.to_dict()
+
+            if 'ALLURIScontext' in df_annotated_combined.columns:
+                # keep only words whose 'ALLURIScontext' is a non-empty list with non-blank content
+                words_for_dropdown = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(
+                    lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]['word'].unique().tolist()
+                words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
+                words_for_dropdown.insert(0, "")
+            else:
+                words_for_dropdown = []
+
+            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
+
+            #return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
+            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
+
+        else:
+            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
+
+    else:
+
+        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
+        return {"text": text, "entities": []}, html_output, state, [], ""
+
+
+# "FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1"
+
+
+def update_urls(selected_word, state):
+    if "df_annotated_combined_dict" in state:
+        # Convert the state dictionary back into a DataFrame
+        df = pd.DataFrame(state["df_annotated_combined_dict"])
+
+        # Filter the DataFrame to rows where 'ALLURIScontext' is not None,
+        # not an empty list, and not a single blank string
+        valid_entries = df[df['ALLURIScontext'].apply(
+            lambda x: isinstance(x, list) and len(x) > 0 and not (len(x) == 1 and not str(x[0]).strip()))]
+
+        # Check if the selected word is in the filtered DataFrame
+        if selected_word in valid_entries['word'].values:
+            urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
+            if 'namedEntity' in df.columns:
+                firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
+                firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
+                if firsturlinlist and firsturlinlist in urls:
+                    # Remove the URL from its current position
+                    urls.remove(firsturlinlist)
+                    # Insert the URL at the first position
+                    urls.insert(0, firsturlinlist)
+
+            # Convert list of URLs to an HTML string with clickable links
+            html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
+            #html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
+            return html_links
+        return ""
+
+    else:
+        return ""
+
+
+# demo = gr.Interface(
+#     fn=nerBio,
+#     inputs=[
+#         gr.Textbox(label="Input text", placeholder="Enter text here..."),
+#         gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List),
+#         gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List),
+#         gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7),
+#         gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False),  #True False
+#         #gr.CheckboxGroup(POSSIBLE_KGchoices_List, label="KGchoices Selection", value=POSSIBLE_KGchoices_List, visible=True),
+#         gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List),
+#         gr.State(value={})
+#     ],
+#     outputs=[
+#         gr.HighlightedText(label="Annotated Text"),
+#         gr.HTML(label="Linked Text", show_label=True, visible=True),  # use gr.HTML to render the annotated text with links
+#         gr.State(),
+#         gr.Dropdown(label="Annotated Concepts", interactive=True, visible=True),
+#         gr.Textbox(label="Linked Entities", interactive=False, visible=True)
+#     ],
+#     live=True,
+#     title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
+#     description="""Interoperability – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
+#     The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
+#     Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
+#
+#     In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
+#     """,
+#     examples=examples,
+#     cache_examples=False,
+#     article="""
+#     **Categories Legend:**
+#     - MED | Medical
+#     - LOC | Locations
+#     - PER | Persons
+#     - ORG | Organizations
+#     - MISC | Miscellanea
+#     - CONC | Concepts & Ideas
+#     - BIOP | Biological
+#     - ACTI | Activities & Behaviors
+#     - ANAT | Anatomy
+#     - CHEM | Chemicals & Drugs
+#     - DEVI | Devices
+#     - DISO | Disorders
+#     - GENE | Genes & Molecular Sequences
+#     - GEOG | Geographic Areas
+#     - LIVB | Living Beings
+#     - OBJC | Objects
+#     - OCCU | Occupations
+#     - ORGA | Organizations
+#     - PHEN | Phenomena
+#     - PHYS | Physiology
+#     - PROC | Procedures
+#     """
+# )
+
+
+# Define the Gradio interface using Blocks
+#description = "This application performs biomedical named-entity recognition and linking."
+with gr.Blocks(title="BioAnnotator") as demo:
+
+    gr.Markdown("# BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)")
+    gr.Markdown("""
+    This application performs biomedical named-entity recognition and linking.
+
+    **Description:**
+    *Interoperability* – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
+    The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
+    Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.
+
+    In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
+    """)
+
+
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
+            models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
+            categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
+            score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7)
+            nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
+            kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
+            state = gr.State(value={})
+
+        with gr.Column():
+            annotated_text = gr.HighlightedText(label="Annotated Text")
+            linked_text = gr.HTML(label="Linked Text", show_label=True, visible=True)
+            word_dropdown = gr.Dropdown(label="Annotated Concepts", show_label=True, visible=True, interactive=True)
+            urls_html = gr.HTML(label="Linked Entities", show_label=True, visible=True)
+
+    ## Define the interactions
+    #text_input.change(fn=nerBio, inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection, state], outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
+    # Define the interactions for all inputs
+    inputs = [text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection]
+    for input_component in inputs:
+        input_component.change(fn=nerBio,
+                               inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
+                                       kgchoices_selection, state],
+                               outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
+
+    word_dropdown.change(fn=update_urls, inputs=[word_dropdown, state], outputs=urls_html)
+
+    # Add examples
+    gr.Examples(examples=examples,
+                inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
+                        kgchoices_selection])
+
+    gr.Markdown("""
+    **Categories Legend:**
+    - MED | Medical
+    - LOC | Locations
+    - PER | Persons
+    - ORG | Organizations
+    - MISC | Miscellanea
+    - CONC | Concepts & Ideas
+    - BIOP | Biological
+    - ACTI | Activities & Behaviors
+    - ANAT | Anatomy
+    - CHEM | Chemicals & Drugs
+    - DEVI | Devices
+    - DISO | Disorders
+    - GENE | Genes & Molecular Sequences
+    - GEOG | Geographic Areas
+    - LIVB | Living Beings
+    - OBJC | Objects
+    - OCCU | Occupations
+    - ORGA | Organizations
+    - PHEN | Phenomena
+    - PHYS | Physiology
+    - PROC | Procedures
+    """)
+
+
+demo.launch()
+#demo.launch(share=True)  # Share your demo with just 1 extra parameter
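
A note on the state handling in the file above: a pandas DataFrame is not kept directly in `gr.State` across callbacks; the app serialises it with `DataFrame.to_dict()` and rebuilds it with `pd.DataFrame(...)` on the next event. Distilled into a minimal, runnable sketch (the function names and sample data here are illustrative, not the app's actual code):

import pandas as pd

def first_call(state: dict) -> dict:
    # annotate once, then stash the DataFrame as a plain dict in the state
    df = pd.DataFrame({"word": ["cholera"], "score": [0.98]})
    state["df_annotated_combined_dict"] = df.to_dict()
    return state

def later_call(state: dict) -> pd.DataFrame:
    # rebuild the DataFrame from the stored dict on a subsequent event
    return pd.DataFrame(state["df_annotated_combined_dict"])

state = first_call({})
print(later_call(state))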
common.py CHANGED
@@ -5,11 +5,12 @@ import numpy as np
 import tiktoken
 from langchain.text_splitter import TokenTextSplitter
 
+# Function to cleanly strip quoted strings
 def strip_quotes(text):
     if text.startswith('"') and text.endswith('"'):
         return text[1:-1]
     return text
 
 def strtobool(val):
     val = val.lower()
     if val in ('yes', 'true', 't', '1'):
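
The hunk above truncates `strtobool` at its first branch. A plausible completion, mirroring the classic `distutils.util.strtobool` semantics (an assumption for illustration; the file's actual remainder is not shown here):

def strtobool(val):
    # maps common truthy/falsy strings to booleans; raises on anything else
    # (assumed completion; modeled on distutils.util.strtobool)
    val = val.lower()
    if val in ('yes', 'true', 't', '1'):
        return True
    elif val in ('no', 'false', 'f', '0'):
        return False
    else:
        raise ValueError(f"invalid truth value {val!r}")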
llmqueryNer.py ADDED
@@ -0,0 +1,881 @@
1
+ import os
2
+ import sys
3
+
4
+ import openai
5
+ import json
6
+ import time
7
+ from tqdm import tqdm
8
+
9
+ import logging
10
+
11
+ from functools import partial
12
+ import pandas as pd
13
+
14
+ import tiktoken
15
+ from langchain.text_splitter import TokenTextSplitter
16
+
17
+ import argparse
18
+
19
+ from common import cleanInputText, encoding_getter, tokenizer, token_counter
20
+
21
+ #from llmqueryHF import api_call_HFonPremises
22
+
23
+ #from dgl_client.api_cli import APIClient, InferenceClient
24
+
25
+ #DGL_API_ENDPOINT = "https://www.diglife.eu/inference"
26
+ #client_dglc = InferenceClient(backend_url=DGL_API_ENDPOINT)
27
+
28
+ fkeyname = "GPTJRC-APItoken.key"
29
+ key_gptjrc=""
30
+ if os.path.exists(fkeyname):
31
+ with open(fkeyname) as f:
32
+ key_gptjrc = f.read()
33
+ else:
34
+ key_gptjrc = os.environ['key_gptjrc']
35
+ clientGPTJRC = openai.OpenAI(api_key=key_gptjrc, base_url="https://api-gpt.jrc.ec.europa.eu/v1")
36
+
37
+
38
+ """
39
+ query LLM API end point on list of text, seamlessly
40
+
41
+ features:
42
+ - build in retry in case of error
43
+ - cache the results in case of crash
44
+ - call LLM with a lambda or as regular function call
45
+
46
+ supported API:
47
+ - OpenAI
48
+ - GPT@JRC
49
+ - F7 (DigLife)
50
+
51
+ issues:
52
+ - the cache is written after each succesfull call, could results in slowdown for large dataset
53
+ - for the moment deals only with openai's rate limit error, all other error will result in crash
54
+ """
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+ # ### OPENAI API
66
+
67
+ def setup_openai(org=None, key=None):
68
+ if org is not None:
69
+ openai.organization = org
70
+ # else:
71
+ # openai.organization = os.getenv("OPENAI_API_ORGANIZATION")
72
+
73
+ if key is not None:
74
+ openai.api_key = key
75
+ else:
76
+ openai.api_key = os.getenv("OPENAI_API_KEY")
77
+ #
78
+ print(model_list_openai())
79
+
80
+ def api_call_openai(prompt: str, input_text: str, model: str, temperature: int, timeout_retry: int=5, delimiter: str = "```", InContextExamples: list[[str]] = [], debug=False, args: argparse.Namespace=None):
81
+ """ call openai API, with a retry in case of RateLimitError """
82
+
83
+ if not(prompt) or prompt.strip=="" or not(input_text) or input_text.strip=="":
84
+ logging.warning("No text or promt supplied! Skypping it!")
85
+ return None
86
+
87
+ if delimiter and len(delimiter)>0:
88
+ input_text = delimiter + input_text + delimiter
89
+
90
+ response = None
91
+
92
+ myMessages = []
93
+ if InContextExamples:
94
+ for row in InContextExamples:
95
+ myMessages.append({"role": "system", "content": prompt})
96
+ for indCol, colVal in enumerate(row):
97
+ if indCol == 0:
98
+ if delimiter and len(delimiter) > 0:
99
+ myMessages.append({"role": "user", "content": (delimiter + colVal + delimiter)})
100
+ else:
101
+ myMessages.append({"role": "user", "content": colVal})
102
+ elif indCol == 1:
103
+ myMessages.append({"role": "assistant", "content": colVal})
104
+
105
+ myMessages.append({"role": "system", "content": prompt})
106
+ myMessages.append({'role': 'user', 'content': input_text})
107
+
108
+ max_retries = 50
109
+ iteration = 1
110
+ while response is None and max_retries > 0:
111
+ try:
112
+ response = openai.ChatCompletion.create(
113
+ model=model,
114
+ # messages=[
115
+ # {"role": "system", "content": prompt},
116
+ # {'role': 'user', 'content': input_text},
117
+ # ],
118
+ messages=myMessages,
119
+ temperature=temperature,
120
+ #max_tokens=32000, #it gives error
121
+ #max_response_tokens=32000 #it gives error
122
+ )
123
+ except openai.RateLimitError as e:
124
+ response = None
125
+ max_retries = max_retries - 1
126
+ print(e)
127
+ nt = token_counter((prompt + input_text), model)
128
+ print("Model "+str(model)+" - Length of overall prompt message ", str(nt))
129
+ print("current iteration ", iteration)
130
+ print("try other ", max_retries, " times")
131
+ print("sleeping", int(iteration * timeout_retry), "s")
132
+ print(time.sleep(int(iteration * timeout_retry)))
133
+ iteration = iteration + 1
134
+ except Exception as err:
135
+ response = None
136
+ max_retries = max_retries - 1
137
+ print(err)
138
+ nt = token_counter((prompt + input_text), model)
139
+ print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
140
+ print("current iteration ", iteration)
141
+ print("try other ", max_retries, " times")
142
+ print("sleeping", int(iteration*timeout_retry), "s")
143
+ time.sleep(int(iteration*timeout_retry))
144
+ iteration = iteration + 1
145
+
146
+ if response is None and max_retries <= 0:
147
+ print("\n")
148
+ print(prompt + input_text)
149
+ print("\n")
150
+ print("\nTried many times and did not succeed, there is something strange. Check the problem...exiting now\n")
151
+ sys.exit()
152
+
153
+ return response
154
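+
+ # Example call (illustrative; assumes setup_openai() has been run and the account
+ # can access the named model -- note the argument order: prompt first, then input text):
+ # r = api_call_openai("you will answer in Spanish", "say hello world", "gpt-3.5-turbo", 0)
+ # if r: print(r.choices[0].message.content)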
+
155
+ def model_list_openai():
156
+ return openai.Model.list()
157
+
158
+
159
+ ### GPT@JRC API
160
+
161
+ def setup_gptjrc_formerOpenAI(token=None):
162
+ if token is None:
163
+ token=os.getenv("GPTJRC_TOKEN")
164
+ openai.organization = ""
165
+ openai.api_key = token
166
+ #openai.api_type = "open_ai"
167
+ openai.api_base = "https://api-gpt.jrc.ec.europa.eu/v1"
168
+ #
169
+ print(model_list_gptjrc())
170
+
171
+
172
+ def setup_gptjrc(token=None):
173
+ # if token is None:
174
+ # #token=os.getenv("GPTJRC_TOKEN")
175
+ # token = os.getenv("OPENAI_API_KEY")
176
+ #
177
+ # clientGPTJRC = openai.OpenAI(api_key=token, base_url = "https://api-gpt.jrc.ec.europa.eu/v1")
178
+
179
+ all_models = clientGPTJRC.models.list()
180
+ # for model in all_models:
181
+ # print(model.id)
182
+
183
+ chat_models = [model for model in all_models.data if model.model_usage == "chat"]
184
+ print(f"\nGPTJRC - Found {len(chat_models)} chat models:")
185
+ for model in chat_models:
186
+ print(" " + str(model.id))
187
+ embed_models = [model for model in all_models.data if model.model_usage != "chat"]
188
+
189
+ print(f"\nGPTJRC - Found {len(chat_models)} embedding models:")
190
+ for model in embed_models:
191
+ print(" " + str(model.id))
192
+
193
+
194
+ def api_call_gptjrc(prompt: str, input_text: str, model: str, temperature: float, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [], debug=False, args: argparse.Namespace = None):
195
+
196
+
197
+ if not prompt or prompt.strip() == "" or not input_text or input_text.strip() == "":
198
+ logging.warning("No text or promt supplied! Skypping it!")
199
+ return None
200
+
201
+ if delimiter and len(delimiter)>0:
202
+ input_text = delimiter + input_text + delimiter
203
+
204
+ response = None
205
+
206
+ myMessages = []
207
+ if InContextExamples:
208
+ for row in InContextExamples:
209
+ myMessages.append({"role": "system", "content": prompt})
210
+ for indCol, colVal in enumerate(row):
211
+ if indCol == 0:
212
+ if delimiter and len(delimiter) > 0:
213
+ myMessages.append({"role": "user", "content": (delimiter + colVal + delimiter)})
214
+ else:
215
+ myMessages.append({"role": "user", "content": colVal})
216
+ elif indCol == 1:
217
+ myMessages.append({"role": "assistant", "content": colVal})
218
+
219
+ myMessages.append({"role": "system", "content": prompt})
220
+ myMessages.append({'role': 'user', 'content': input_text})
221
+
222
+
223
+ max_retries = 50
224
+ iteration = 1
225
+ while response is None and max_retries>0:
226
+ try:
227
+ # if InContextExamples:
228
+ # response = openai.ChatCompletion.create(
229
+ # headers={"Authorization": "Bearer " + openai.api_key},
230
+ # model=model,
231
+ # messages=[
232
+ # {"role": "system", "content": prompt},
233
+ # {'role': 'user', 'content': InContextExamples[0][0]},
234
+ # {'role': 'assistant', 'content': InContextExamples[0][1]},
235
+ # {"role": "system", "content": prompt},
236
+ # {'role': 'user', 'content': InContextExamples[1][0]},
237
+ # {'role': 'assistant', 'content': InContextExamples[1][1]},
238
+ # {"role": "system", "content": prompt},
239
+ # {'role': 'user', 'content': InContextExamples[2][0]},
240
+ # {'role': 'assistant', 'content': InContextExamples[2][1]},
241
+ # {"role": "system", "content": prompt},
242
+ # {'role': 'user', 'content': input_text},
243
+ # ],
244
+ # temperature=temperature,
245
+ # # max_tokens=4000, #20000, #32000, #it gives error
246
+ # # max_response_tokens=32000 #it gives error
247
+ # )
248
+ # else:
249
+
250
+ # former OpenAI call
251
+ # response = openai.ChatCompletion.create(
252
+ # headers={"Authorization": "Bearer "+openai.api_key},
253
+ # model=model,
254
+ # # messages=[
255
+ # # {"role": "system", "content": prompt},
256
+ # # {'role': 'user', 'content': input_text},
257
+ # # ],
258
+ # messages=myMessages,
259
+ # temperature=temperature,
260
+ # #max_tokens=4000, #20000, #32000, #it gives error
261
+ # #max_response_tokens=32000 #it gives error
262
+ # )
263
+
264
+ response = clientGPTJRC.chat.completions.create(
265
+ model=model,
266
+ stream=False,
267
+ # messages=[{"role": "user", "content": "Hello!"}],
268
+ messages=myMessages,
269
+ temperature=temperature,
270
+ )
271
+
272
+ #print(response.choices[0].message.content)
273
+
274
+ except openai.RateLimitError as e:
275
+ response = None
276
+ max_retries = max_retries - 1
277
+ print(e)
278
+ nt = token_counter((prompt + input_text), model)
279
+ print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
280
+ print("current iteration ", iteration)
281
+ print("try other ", max_retries, " times")
282
+ print("sleeping", int(iteration*timeout_retry), "s")
283
+ time.sleep(int(iteration*timeout_retry))
284
+ iteration = iteration + 1
285
+ print("\npromt:")
286
+ print(prompt)
287
+ print("\ninput_text:")
288
+ print(input_text)
289
+ if max_retries % 5 == 0 and max_retries > 0:  # every 5 failed attempts, clean and shrink the input
290
+ # input_text = input_text[0:-1000]
291
+ # input_text = input_text + delimiter
292
+ #
293
+ input_text = cleanInputText(input_text)
294
+ #
295
+ ntokens = int(token_counter(input_text, model))
296
+ if ntokens > 1000:  # split the context if it is too big (more than 1000 tokens)
297
+ encod = encoding_getter(model)
298
+ text_splitter = TokenTextSplitter(
299
+ # separators=separators,
300
+ encoding_name=encod.name,
301
+ chunk_size=int(0.8 * ntokens),
302
+ chunk_overlap=50,
303
+ length_function=len,
304
+ add_start_index=True,
305
+ )
306
+ texts = text_splitter.create_documents([input_text])
307
+ input_text = texts[0].page_content
308
+ myMessages = []
309
+ myMessages.append({"role": "system", "content": prompt})
310
+ myMessages.append({'role': 'user', 'content': input_text})
311
+ except Exception as err:
312
+ response = None
313
+ max_retries = max_retries - 1
314
+ print(err)
315
+ nt = token_counter((prompt + input_text), model)
316
+ print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
317
+ print("current iteration ", iteration)
318
+ print("try other ", max_retries, " times")
319
+ print("sleeping", int(iteration * timeout_retry), "s")
320
+ time.sleep(int(iteration * timeout_retry))
321
+ iteration = iteration + 1
322
+ print("\npromt:")
323
+ print(prompt)
324
+ print("\ninput_text:")
325
+ print(input_text)
326
+ if max_retries % 5 == 0 and max_retries > 0:  # every 5 failed attempts, clean and shrink the input
327
+ # input_text = input_text[0:-1000]
328
+ # input_text = input_text + delimiter
329
+ #
330
+ input_text = cleanInputText(input_text)
331
+ #
332
+ ntokens = int(token_counter(input_text, model))
333
+ if ntokens > 1000:  # split the context if it is too big (more than 1000 tokens)
334
+ encod = encoding_getter(model)
335
+ text_splitter = TokenTextSplitter(
336
+ # separators=separators,
337
+ encoding_name=encod.name,
338
+ chunk_size=int(0.8 * ntokens),
339
+ chunk_overlap=50,
340
+ length_function=len,
341
+ add_start_index=True,
342
+ )
343
+ texts = text_splitter.create_documents([input_text])
344
+ input_text = texts[0].page_content
345
+ myMessages = []
346
+ myMessages.append({"role": "system", "content": prompt})
347
+ myMessages.append({'role': 'user', 'content': input_text})
348
+
349
+
350
+ if response is None and max_retries <= 0:
351
+ print("\n")
352
+ print(prompt + input_text)
353
+ print("\n")
354
+ print("\nTried many times and did not succeed, there is something strange. Check the problem...exiting now\n")
355
+ sys.exit()
356
+
357
+ return response
358
+
359
+
360
+
361
+ def model_list_gptjrc():
362
+ return openai.Model.list()
363
+
364
+
365
+ ### DGLC API
366
+
367
+ def clean_gpt_out(output_text: str):
+
+     # leftover debug trap for a specific prompt
+     if "From the text below, delimited by triple quotes, extract the following items: 1 - The name of the virus that has caused the outbreak" in output_text:
+         print("debug")
+
+     # truncate the output at the first occurrence of any known chat/special marker
+     markers = [
+         "<|assistant|>", "<|prompt|>", "<|prompter|>", "<|answer|>",
+         "<|im_end|>", "<|endofextract|>", "<br>",
+         "<|/assistant|>", "<|/prompt|>", "<|/prompter|>", "<|/answer|>",
+         "<|/im_end|>", "<|/endofextract|>", "</br>",
+         "</|assistant|>", "</|prompt|>", "</|prompter|>", "</|answer|>",
+         "</|im_end|>", "</|endofextract|>",
+     ]
+     for marker in markers:
+         if marker in output_text:
+             output_text = output_text.split(marker)[0].strip()
+
+     # drop any remaining triple-backtick delimiters and collapse runs of spaces
+     output_text = output_text.replace("```", " ")
+     while "  " in output_text:
+         output_text = output_text.replace("  ", " ")
+
+     return output_text
422
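+
+ # Example (illustrative): special chat markers and trailing text are stripped, e.g.
+ # clean_gpt_out('{"virus": "X"}<|im_end|>ignored tail')  # -> '{"virus": "X"}'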
+
423
+
424
+ # def setup_dglc(key=None):
425
+ # if key is None:
426
+ # ACCESS_KEY = os.getenv("DGL_TOKEN")
427
+ # else:
428
+ # ACCESS_KEY=key
429
+ #
430
+ # client_dglc.login(ACCESS_KEY)
431
+ #
432
+ # #list available models
433
+ # models_available = model_list_dglc()
434
+ # print("DGLC - available models = "+str(models_available))
435
+ #
436
+ # # chat_id = client_dglc.create_chat()
437
+ # # # Or continue the previous one
438
+ # # # chat_id = client_dglc.continue_chat(args.chat_id)
439
+ # # print("\nCHAT_ID dglc", chat_id)
440
+
441
+
442
+ # def api_call_dglc(prompt: str, input_text: str, model: str, temperature: float, timeout_retry: int =5, delimiter: str = "```", InContextExamples: list[[str]] = [], debug=False, args: argparse.Namespace=None):
443
+ #
444
+ # # if model == "gpt-3.5-turbo":
445
+ # # model = "OA_GPT3.5"
446
+ #
447
+ #
448
+ # if not(input_text) or input_text.strip=="" or not(prompt) or prompt.strip=="":
449
+ # logging.warning("No text or promt supplied! Skypping it!")
450
+ # return None
451
+ #
452
+ # message = ""
453
+ # if InContextExamples:
454
+ # for row in InContextExamples:
455
+ # message = message + prompt
456
+ # for indCol, colVal in enumerate(row):
457
+ # if indCol == 0:
458
+ # if delimiter and len(delimiter) > 0:
459
+ # message = message + delimiter + colVal + delimiter
460
+ # else:
461
+ # message = message + colVal
462
+ # elif indCol == 1:
463
+ # message = message + " \n" + colVal + " \n"
464
+ #
465
+ # if delimiter and len(delimiter) > 0:
466
+ # message = prompt + delimiter + input_text + delimiter
467
+ # else:
468
+ # message = prompt + "\n" + input_text
469
+ #
470
+ # if debug:
471
+ # print("\n")
472
+ # print(message)
473
+ #
474
+ # chat_id = client_dglc.create_chat()
475
+ # # Or continue the previous one
476
+ # # chat_id = client.continue_chat(args.chat_id)
477
+ # if debug:
478
+ # print("\nCHAT_ID dglc", chat_id)
479
+ #
480
+ # try:
481
+ # events = client_dglc.send_message(message, model, temp=temperature)
482
+ # except Exception as err:
483
+ # logging.error(f'FAILED api_call_dglc WITH MESSAGE: \'{message}\' \nMODEL: {model}; \n\tError: {err}')
484
+ #
485
+ # events= None
486
+ # max_retries = 50
487
+ # iteration = 1
488
+ # while events is None and max_retries > 0:
489
+ # try:
490
+ # events = client_dglc.send_message(message, model, temp=temperature)
491
+ # except Exception as err:
492
+ # events = None
493
+ # max_retries = max_retries - 1
494
+ # print(err)
495
+ # nt = token_counter((message), model)
496
+ # print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
497
+ # print("current iteration ", iteration)
498
+ # print("try other ", max_retries, " times")
499
+ # print("sleeping", int(iteration * timeout_retry), "s")
500
+ # print(time.sleep(int(iteration * timeout_retry)))
501
+ # iteration = iteration + 1
502
+ #
503
+ # if (events == None) and (max_retries <= 0):
504
+ # print("\n")
505
+ # print(message)
506
+ # print("\n")
507
+ # print("\nTried many times and did not succeed, there is something strange. Check the problem...exiting now\n")
508
+ # sys.exit()
509
+ #
510
+ # if events:
511
+ # event = [str(x) for x in events]
512
+ # # The message is streamed token by token
513
+ # # for event in events:
514
+ # # print(event, end="", flush=True)
515
+ # if event:
516
+ # event = event[-1]
517
+ # else:
518
+ # event = None
519
+ #
520
+ # if debug:
521
+ # print("\nAPI CALL ANSWER:")
522
+ # print(event)
523
+ # print("\n")
524
+ #
525
+ # else:
526
+ # event = None
527
+ #
528
+ # return event
529
+
530
+ # def model_list_dglc():
531
+ # return client_dglc.get_available_models()
532
+
533
+
534
+ ### CALLING MODELS
535
+
536
+
537
+ def call_model_with_caching(input_text: str, prompt: str, model: str, temperature: float, handler,
+                             map_query_input_output: dict, cache_fp: str, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [], verbose: bool = True, args: argparse.Namespace = None):
539
+ """ call openai's API but take care of caching of results
540
+ input_text: input text
541
+ prompt: prompt
542
+ model: model name (as parameter of the query)
543
+ temperature: temperature (0: precise, 1: creative)
544
+ handler: delegate function that will make the call (not necessarily only OpenAI, could be any one)
545
+ map_query_input_output: cache dict containing already processed data
546
+ cache_fp: file to which write content of cache after each call
547
+ """
548
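+
+ # Cache layout (illustrative): a two-level dict keyed by "model__temperature__prompt",
+ # then by input text, e.g.
+ # {"gpt-35-turbo-16k__0__translate to Spanish": {"hello world": "hola mundo"}}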
+
549
+ if not input_text or input_text.strip() == "" or not prompt or prompt.strip() == "":
550
+ logging.warning("No text or promt supplied! Skypping it!")
551
+ return None
552
+
553
+ # try to read cache
554
+
555
+ if map_query_input_output is not None:
556
+ key = model + "__" + str(temperature) + "__" + prompt
557
+
558
+ if key in map_query_input_output:
559
+ if input_text in map_query_input_output[key]:
560
+ output = map_query_input_output[key][input_text]
561
+ # if input_text.strip() == "":
562
+ # print("here")
563
+
564
+ # if handler == api_call_dglc:
565
+ # output = clean_gpt_out(output) #clean output
566
+
567
+ if verbose:
568
+ print("RETRIEVED CACHED RESULT FOR:\n", prompt, "\n", delimiter, input_text, delimiter, "\n=>\n", output, "\n")
569
+
570
+ return output
571
+
572
+ # call
573
+
574
+ response = None
575
+
576
+ try:
577
+ response = handler(prompt, input_text, model, temperature, timeout_retry, delimiter, InContextExamples, args=args)
578
+ except Exception as err:
579
+ logging.error(f'FAILED WITH PROMPT: \'{prompt}\' \nLEN_TEXT: {len(input_text)}, \nTEXT: {(input_text)}, \nMODEL: {model}; \n\tError: {err}')
580
+ #else:
581
+ # # logging.warning(f'INDEX: \'{SOURCE_INDEX}\' Inserted {inserted} rows out of {num_lines} rows [{round((inserted/num_lines)*100, 2)}%]')
582
+ # break
583
+
584
+ if response:
585
+ if isinstance(response, str):
586
+ output_text = response
587
+ else:
588
+ #output_text = response['choices'][0]['message']['content']
589
+ output_text = response.choices[0].message.content
590
+
591
+ # if handler == api_call_dglc:
592
+ # output_text = clean_gpt_out(output_text) # clean output
593
+
594
+
595
+ # write to cache
596
+
597
+ if map_query_input_output is not None:
598
+ if key not in map_query_input_output:
599
+ map_query_input_output[key] = {}
600
+
601
+ if output_text:
602
+ if output_text != "":
603
+ map_query_input_output[key][input_text] = output_text
604
+
605
+ with open(cache_fp, "w") as f:
606
+ json.dump(map_query_input_output, f)
607
+
608
+ if verbose:
609
+ print("API CALL REPLY FOR:\n", prompt, "\n", delimiter, input_text, delimiter, "\n=>\n", output_text, "\n")
610
+
611
+ return output_text
612
+
613
+ else:
614
+ return None
615
+
616
+
617
+ def call_model(input_text: str, prompt: str, model: str, temperature: float, handler, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [],
+                verbose: bool = True, args: argparse.Namespace = None):
619
+ """ call openai's API but take care of caching of resuts
620
+ input_text: input text
621
+ prompt: prompt
622
+ model: model name (as parameter of the query)
623
+ temperature: temperature (0: precise, 1: creative)
624
+ handler: delegate function that will make the call (not necessarily only OpenAI, could be any one)
625
+ """
626
+
627
+
628
+ if not input_text or input_text.strip() == "" or not prompt or prompt.strip() == "":
629
+ logging.warning("No text or promt supplied! Skypping it!")
630
+ return None
631
+
632
+ return call_model_with_caching(input_text, prompt, model, temperature, handler, None, None, timeout_retry, delimiter, InContextExamples, verbose, args=args)
633
+
634
+
635
+
636
+ def process_list(list_input_text: list[str], prompt: str, service_provider: str, model: str, temperature: float,
+                  cache_prefix_fp: str = None, delimiter: str = "```", InContextExamples: list[list[str]] = [], args: argparse.Namespace = None):
638
+ """ process a list of text with a prompt and a model
639
+ list_input_text: list input text
640
+ prompt: prompt
641
+ service_provider: either "openai" or "gptjrc" for the moment
642
+ model: model name (as parameter of the query)
643
+ temperature: temperature (0: precise, 1: creative)
644
+ cache_prefix_fp: prefix of the file to which write content of cache after each call
645
+ """
646
+
647
+ if cache_prefix_fp is not None:
648
+ cache_fp = cache_prefix_fp + "___" + "__".join([service_provider, model, str(temperature)]).replace(" ", "_") + ".json"
649
+
650
+ if os.path.exists(cache_fp):
651
+ with open(cache_fp) as f:
652
+ map_query_input_output = json.load(f)
653
+ else:
654
+ map_query_input_output = {}
655
+ else:
656
+ map_query_input_output = None
657
+ cache_fp = None
658
+
659
+ handler = None
660
+ #if service_provider.lower() == "dglc": handler = api_call_dglc
661
+ if service_provider.lower() == "openai": handler = api_call_openai
662
+ if service_provider.lower() == "gptjrc": handler = api_call_gptjrc
663
+ #if service_provider.lower() == "hfonpremises": handler = api_call_HFonPremises
664
+
665
+ list_output_text = []
666
+ for input_text in tqdm(list_input_text):
667
+ output_text = call_model_with_caching(input_text, prompt, model, temperature, handler, map_query_input_output,
668
+ cache_fp, delimiter=delimiter, InContextExamples=InContextExamples, args=args)
669
+ list_output_text.append(output_text)
670
+
671
+ return list_output_text
672
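+
+ # Example (illustrative; mirrors the commented tests in __main__ below):
+ # outs = process_list(["hello world", "hello everybody"],
+ #                     "you will translate to Spanish", "gptjrc",
+ #                     "gpt-35-turbo-16k", 0, cache_prefix_fp="UNITTEST")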
+
673
+
674
+
675
+
676
+ if __name__ == "__main__":
677
+
678
+ USE_CACHE = False  # set to True to enable result caching
679
+
680
+ #service_provider = "openai"
681
+ #model_name = "gpt-3.5-turbo-16k"
682
+ #
683
+ #
684
+ #service_provider = "dglc"
685
+ # dglc available models: 'OA_SFT_Pythia_12B', 'JRC_RHLF_13B', 'OA_GPT3.5', 'OA_GPT3'
686
+ # model_name = "gpt-3.5-turbo" #OpenAI name
687
+ # model_name = 'JRC_RHLF_13B'
688
+ #model_name = "OA_SFT_Pythia_12B" #EleutherAI-pythia-12b
689
+ # model_name = "OA_GPT3"
690
+ # model_name = "GPT@JRC_4"
691
+ #
692
+ #
693
+ #service_provider = "gptjrc"
694
+ #model_name = "gpt-35-turbo-0613"
695
+ #model_name = "gpt-35-turbo-16k"
696
+ #model_name = "gpt-4-32k" #GPT-4 with a context length of 32,768 tokens - around 116000
697
+ service_provider = "HFonPremises"
698
+
699
+ #model_name = "llama-3.1-70b-instruct"
700
+ #model_name = "llama-3.1-70b-instruct"
701
+
702
+ #model_name="nous-hermes-2-mixtral-8x7b-dpo"
703
+ #model_name = "nous-hermes-2-mixtral-8x7b-dpo"
704
+
705
+ #model_name="llama-3.1-8b-instruct"
706
+ #model_name = "llama-3.1-8b-instruct"
707
+ model_name = "llama-3.1-70b-instruct"
708
+
709
+ # temperature: temperature_value (0: precise, 1: creative)
710
+ temperature_value = 0.01 # 0.1
711
+
712
+ ##################################################################################################
713
+
714
+ #OpenAI ChatGPT API
715
+ if service_provider == "openai":
716
+ MyOpenAPIKey = ""
717
+ fkeyname="OpenAI-DigLifeAccount-APItoken.key"
718
+ if os.path.exists(fkeyname):
719
+ with open(fkeyname) as f:
720
+ MyOpenAPIKey = f.read()
721
+ else:
722
+ MyOpenAPIKey = os.environ['key_MyOpenAPI']
723
+ setup_openai(key=MyOpenAPIKey)
724
+
725
+ # # test api call
726
+ # r = api_call_openai("say hello world", "you will answer in Spanish", "gpt-3.5-turbo", 0)
727
+ # print(r)
728
+
729
+ # # test process list
730
+ # r = process_list(["hello world", "hello everybody"], "you will translate to Spanish", "openai", "gpt-3.5-turbo", 0)
731
+ # print(r)
732
+
733
+ # # process list with caching
734
+ # r = process_list(["hello world", "hello everybody"], "you will translate to Spanish", "openai", "gpt-3.5-turbo", 0, "UNITTEST")
735
+ # print(r)
736
+
737
+ #### GPT@JRC API
738
+ if service_provider == "gptjrc":
739
+ key_gptjrc = ""
740
+ fkeyname = "GPTJRC-APItoken.key"
741
+ if os.path.exists(fkeyname):
742
+ with open(fkeyname) as f:
743
+ key_gptjrc = f.read()
744
+ else:
745
+ key_gptjrc = os.environ['key_gptjrc']
746
+
747
+ os.environ['OPENAI_API_KEY'] = key_gptjrc
748
+ #setup_gptjrc(key_gptjrc)
749
+ setup_gptjrc()
750
+
751
+
752
+ #### DGLC API
753
+ if service_provider == "dglc":
754
+ key_dglc = ""
755
+ fkeyname = "DGLC-APItoken.key"
756
+ if os.path.exists(fkeyname):
757
+ with open(fkeyname) as f:
758
+ key_dglc = f.read()
759
+ else:
760
+ key_dglc = os.environ['key_dglc']
761
+ # setup_dglc(key=key_dglc)
762
+
763
+ # TEST OF DGLC API CALL
764
+ # input_text = "this morning a rabbit killed a hunter"
765
+ # print("\ntext = \n"+input_text)
766
+ # prompt = "please reformulate the text, add more details, the text should be between 200 and 500 characters:"
767
+ # print("\nquestion = \n" + prompt)
768
+ # model= "OA_SFT_Pythia_12B" #not available in OpenAI
769
+ # print("\nmodel = \n" + model)
770
+ # print("\n")
771
+ #
772
+ # r = api_call_dglc(prompt, input_text, model, 0.5)
773
+ # if r:
774
+ # print(r)
775
+ #
776
+ # print("\nDone!\n")
777
+
778
+
779
+ ###########################################################################
780
+
781
+
782
+ if USE_CACHE:
783
+ # cache_prefix_fp: prefix of the file to which write content of cache after each call
784
+ cache_prefix_fp = "LLMQUERYTEST"
785
+ cache_name = cache_prefix_fp + "___" + "__".join([service_provider, model_name, str(temperature_value)]).replace(" ", "_") + ".json"
786
+
787
+ if os.path.exists(cache_name):
788
+ with open(cache_name) as f:
789
+ load_map_query_input_output = json.load(f)
790
+ else:
791
+ load_map_query_input_output = {}
792
+
793
+ myPromt = f"""
794
+ translate into Spanish the text below, delimited by triple backticks. \
795
+ Text:
796
+ """
797
+
798
+ myDelimiter = "```"
799
+
800
+ ###
801
+
802
+ #example for counting number of tokens of the overall prompt for the model
803
+
804
+ # input_text = "one, two, three, a step fortward Mary"
805
+ # overall_string = myPromt + myDelimiter + input_text + myDelimiter
806
+ # nt = token_counter(overall_string, model_name)
807
+ # print("\nNumber of Tokens in the example = "+str(nt))
808
+
809
+ encod = encoding_getter(model_name)
810
+ print("\nencodName = " + str(encod.name))
811
+
812
+ InContextExamples = []
813
+ # InContextExamples = [["ADENOVIRUS - USA (02): (NEW JERSEY) UPDATE A ProMED-mail post http://www.promedmail.org ProMED-mail is a program of the International Society for Infectious Diseases http://www.isid.org Date: Sun 28 Oct 2018 3:12 PM Source: CBS news [edited] https://www.cbsnews.com/news/adenovirus-outbreak-new-jersey-wanaque-center- facility-9th-child-dies-2018-10-28/ Another child has died amid a deadly virus outbreak at a New Jersey rehabilitation, bringing the total number of deaths to 9, officials said [Sun 28 Oct 2018]. The state\'s Department of Health said the victim was a \"medically fragile\" child who had a confirmed case of adenovirus. The unidentified child died Saturday evening [27 Oct 2018] at the Wanaque Center for Nursing and Rehabilitation in Haskell, the department said. There have been 25 cases associated with the outbreak. \"This is a tragic situation, and our thoughts are with the families who are grieving right now,\" Health Commissioner Dr Shereef Elnahal said in a statement. \"We are working every day to ensure all infection control protocols are continuously followed and closely monitoring the situation at the facility.\" Adenoviruses are a family of viruses that account for about 5 to 10 percent of fevers in young children, but most patients recover. The infections can affect the tissue linings of the respiratory tract, eyes, intestines, urinary tract and nervous system, causing illnesses ranging from a cold to bronchitis to pneumonia to pink eye. The children at Wanaque appear to have been more susceptible to serious infections due to their other medical conditions. Children at the center are severely disabled, with some living in comas, and for many, it is their permanent home, the Bergen Record reports. Many will never walk or talk, and some have spent virtually their whole lives there, according to the paper. . Communicated by: ProMED-mail Rapporteur Kunihiko Iizuka [Human adenoviruses (HAdVs) are non-enveloped, linear double-stranded DNA viruses encapsidated within a protein shell and have been categorized into 6 species (A-F) that contain 51 immunologically distinct serotypes (Fields virology. 5th ed. Philadelphia (PA): Lippincott-Raven; 2007. p. 2395-436). HAdVs most commonly cause acute respiratory disease; however, depending on the infecting HAdV serotype and tropism resulting from differential host receptor use, the wide variety of symptoms can include pneumonia, febrile upper respiratory illness, conjunctivitis, cystitis, and gastroenteritis (Principles and practice of infectious diseases. 5th ed. Vol 2. Philadelphia (PA): Churchill Livingstone; 2005. p. 1835-41). The severity of disease appears dependent on the immunocompetence and cardiopulmonary health of the host, and the spectrum of disease can range from subclinical to severe respiratory distress and death. Immunocompromised patients (such as bone marrow transplant recipients) are particularly susceptible to HAdV infection, resulting in severe illness and deaths, whereas illness in immunocompetent patients generally resolves without major complication. The outbreak report above involves young children that are in a healthcare facility and immunocompromised on account of underlying co- morbid conditions. Adenovirus associated morbidity and mortality in this setting would require intensive infection control measures. In immunocompromised patients, several drugs, such as cidofovir, ribavirin, ganciclovir, and vidarabine, have been used to treat adenovirus infections. 
Most of these agents are virostatic, may induce drug resistance, and have significant risks of toxicities, as well as risks to healthcare staff [e.g., aerosolized ribavirin - Mod.ML]. - Mod.UBA HealthMap/ProMED map available at: New Jersey, United States: https://promedmail.org/promed-post?place=6117463,232 ] See Also Adenovirus - USA: (NJ) children, fatal 20181025.6108470 .uba/mj/ml",
814
+ # "{\"virus\": \"ADENOVIRUS\", \"country\": \"USA\", \"date\": \"2018-10-28\", \"cases\": \"25\", \"deaths\": \"9\"}"],
815
+ # ["NIPAH VIRUS - INDIA (14): (KERALA) * A ProMED-mail post http://www.promedmail.org ProMED-mail is a program of the International Society for Infectious Diseases http://www.isid.org Date: Tue 3 Jul 2018 Source: MediBulletin [edited] https://medibulletin.com/2018/07/03/bats-indicted-in-kerala-nipah-outbreak- icmr-sends-paper-to-lancet/ Putting to rest suspense about the source of the Nipah virus infections in Kerala, scientists from the Indian Council of Medical Research have now found the virus in bats that were caught from the affected areas. At least 17 people died of Nipah infection in Mallapuram and Kozhikode districts of Kerala over April and May [2018]. While the 1st batch of bats caught from the well in Kozhikode in the house from where the 1st case was reported, had tested negative; of the 2nd batch of 52 fruit bats, 19.2 percent were found to carry the virus. The findings will be published in The Lancet. Health minister J P Nadda was informed about the findings in a meeting last week. In the meeting, scientists from ICMR and public health officials also told the minister that circumstances have now improved enough for the state to be declared Nipah free. The incubation period of Nipah is 5 to 14 days. The last case was in May [2018] and now that 2 incubation periods have elapsed without any fresh cases, the specter of the dreaded disease seems to be finally receding. . Communicated by: ProMED-mail <[email protected]> [It is good to learn that there have been no additional cases of Nipah virus infection in Kerala. As was mentioned earlier, it is not surprising that the bats taken from the well were negative for the virus. Giant fruit bats (flying foxes genus _Pteropus_), the reservoir of Nipah virus in Bangladesh and Malaysia, do not roost in wells. They roost in tree tops. The species sampled in the 2nd batch of bats was not mentioned, but were likely flying foxes. It is fortunate that virus positive bats were found in this 2nd sampling. As commented earlier, bats \"may only be infectious for a week or 2, and then they clear the virus and they\'re no longer infectious,\" said Jonathan Epstein, a veterinarian and epidemiologist at EcoHealth Alliance, New York, who has, for over a decade, studied Nipah outbreaks and the bats that cause them, in Malaysia, India and Bangladesh. \"That\'s why these outbreaks are relatively rare events, given the fact that these bats are so abundant and so common but very few of them are ever actually shedding virus at a given time.\" Epstein and others conducted an experimental study of _Pteropus_ bats in 2011 and found that the time window in which the bats are capable of passing on the infection to other animals or humans is quite small. In fact, the virus can\'t be found in experimentally infected bats after a few weeks. The few bats in an infected population that could be shedding the virus may be doing so in low quantities and for a short duration. \"Finding that bats don\'t have Nipah virus at the time of sampling certainly doesn\'t mean that it didn\'t come from those bats, particularly _P. medius_,\" Epstein said. 
\"The overwhelming abundance of evidence really shows that this bat is the reservoir for Nipah virus on the subcontinent in Bangladesh and in India.\" - Mod.TY Maps of India can be accessed at: http://www.mapsofindia.com/maps/india/india-political-map.htm HealthMap/ProMED map available at: Kerala State, India: https://promedmail.org/promed-post?place=5887151,308 ] See Also Nipah virus - India (13): (KL) 20180611.5851326 Nipah virus - India (12) 20180603.5836554 Nipah virus - India (11): (KL) 20180602.5835342 Nipah virus - India (10): (KL) 20180602.5833137 Nipah virus - India (09): (WB ex KL) susp. 20180530.5829184 Nipah virus - India (08): (KR ex KL) susp. 20180529.5826769 Nipah virus - India (07) 20180528.5822566 Nipah virus - India (06): (KL,KA) 20180526.5819777 Nipah virus - India (05): (KL,TG) 20180525.5817917 Nipah virus - India (04): (KL, KA) 20180524.5815473 Nipah virus - India (03): (KL) conf. 20180522.5812214 Nipah virus - India (02): (KL) conf 20180521.5809003 Nipah virus - India: (KL) susp 20180521.5807513 2007 . Nipah virus, fatal - India (West Bengal) (02) 20070511.1514 Nipah virus, fatal - India (West Bengal) 20070508.1484 Undiagnosed deaths - Bangladesh, India (04) 20070504.1451 .ty/ao/jh",
816
+ # "{\"virus\": \"Nipah virus\", \"country\": \"India\", \"date\": \"2018-07-03\", \"cases\": \"None\", \"deaths\": \"17\"}"],
817
+ # ["UNDIAGNOSED RESPIRATORY ILLNESS - USA: (NEW YORK), ex MIDDLE EAST, FLIGHT PASSENGERS AND CREW, REQUEST FOR INFORMATION * A ProMED-mail post http://www.promedmail.org ProMED-mail is a program of the International Society for Infectious Diseases http://www.isid.org [1] Date: Wed 5 Sep 2018 Source: Stat News [edited] https://www.statnews.com/2018/09/05/plane-quarantined-at-kennedy-airport-amid- reports-of-100-ill-passengers/ An Emirates Airline flight was held in quarantine for a period at New York\'s John F. Kennedy International Airport after a large number of passengers were reported feeling ill during the flight. The airline said in a statement that about 10 passengers on the flight from Dubai to New York had become ill on board. But the Centers for Disease Control and Prevention [CDC] said the estimated number was about 100. Passengers and some crew complained of illness including cough, according to the CDC; some had developed a fever. \"CDC public health officers are working with port authority, EMS, and CBP officials to evaluate passengers including taking temperatures and making arrangements for transport to local hospitals,\" the CDC said. \"Passengers who are not ill will be allowed to continue with their travel plans, and if necessary will be followed up with by health officials.\" A spokesman for New York Mayor Bill de Blasio said 19 passengers had taken ill - 10 were sent to a hospital and another 9 refused medical attention. There were approximately 521 passengers on the flight. A number of the passengers on the flight were returning from the Hajj, the annual mass pilgrimage to Mecca, in Saudi Arabia, a source told STAT. It was unclear if the people who were ill were the same passengers who had attended. Saudi Arabia has reported cases of MERS, Middle East respiratory syndrome, which passes to people from camels. But the fact that so many people became ill during the flight would make MERS seem an unlikely cause. [Byline: Helen Branswell] . Communicated by: Meghan May University of New England <[email protected]> [2] Date: Wed 5 Sep 2018 Source: BBC [edited] https://www.bbc.com/news/world-us-canada-45425412 A total of 19 people have been taken ill after an Emirates airline plane landed in New York, officials say. The plane was quarantined at JFK airport as those on board were checked by health officials. As many as 10 were taken to hospital but others refused treatment. The US Centers for Disease Control and Prevention (CDC) said that initially about 100 people including some crew had complained of illness. Flight 203 from Dubai landed at 09:10 (13.10 GMT) with 521 passengers. Emergency vehicles were seen on the runway as it landed. Soon afterwards, Emirates airline tweeted that the sick passengers were being attended to and those who were unaffected would be allowed to leave the plane. The CDC said in a statement that is was \"aware of an Emirates flight from Dubai that arrived this morning at JFK\". \"Approximately 100 passengers, including some crew on the flight, complained of illness including cough and some with fever. \"CDC public health officers are working with. officials to evaluate passengers including taking temperatures and making arrangements for transport to local hospitals those that need care.\" Later Eric Phillips, spokesman for New York Mayor Bill de Blasio, confirmed that all the passengers were off the plane and the sick people had been taken to hospital. 
He said that some of the passengers had originally come from the Saudi Arabian city of Mecca, which was currently experiencing a flu outbreak, and that the passengers\' symptoms were \"pointing to the flu\". . Communicated by: ProMED-mail <[email protected]> [3] Date: Wed 5 Sep 2018, 10:55 AM ET Source: NPR [edited] https://www.npr.org/2018/09/05/644826743/emirates-airline-says- about-10-passengers-fell-ill-on-flight-to-new-york Health and safety officials are investigating an illness that struck people on an Emirates Airline flight from Dubai, United Arab Emirates, to New York\'s John F. Kennedy International Airport on Wednesday morning. A total of 7 crew members and 3 passengers were taken to the hospital, Emirates Airline said. It added that [Wed 5 Sep 2018] return flight from New York to Dubai would leave 3 hours late. The Centers for Disease Control and Prevention said around 100 people on the overnight Flight 203 had complained of illness. For some, the symptoms were limited to a cough; others had a fever. An Emirates A380 in quarantine at JFK Airport right now awaiting CDC officials after about 100 passengers became ill with fevers over 100 degrees and coughing. Flight 203 had just arrived from Dubai. Ten people were taken off the plane for treatment at Jamaica Hospital Medical Center, said Eric Phillips, press secretary for New York City Mayor Bill de Blasio. He said 9 others were found to be sick but refused medical attention. The aircraft was carrying around 521 passengers. Health officials allowed people to disembark only after checking each one for symptoms, Phillips stated, adding, \"The plane\'s been quarantined and the CDC is on the scene.\" As for what the sickness might be, Phillips referred to a \"flu outbreak\" in Mecca, Saudi Arabia, and said that might be a possibility, stating, \"It appears some of the ill passengers came from Mecca before getting on in Dubai.\" [One] passenger called it the \"worst flight ever,\" saying on Twitter that the plane \"was basically a flying infirmary. Many of these people should never have been allowed to board.\" By around noon, 432 passengers had been cleared and allowed to go to the customs area, according to Phillips. A few others who showed symptoms were held for treatment and possible transport to the hospital. NPR\'s Rob Stein reports that the Centers for Disease Control and Prevention is working with local authorities. The Airbus A380 jet was isolated on the tarmac at JFK, as officials took stock of the situation. Images from the scene showed a row of ambulances alongside the aircraft. [Another] passenger said via Twitter that CDC staff came onto the plane and that everyone aboard was asked to fill out a form providing their contact information for the next 3 weeks. [Byline: Bill Chappell] . Communicated by: ProMED-mail <[email protected]> [Mention a plane load of individuals with febrile respiratory symptoms coming from the Middle East and immediate panic of \"could this be MERS-CoV?\" Equally or more serious than this is an outbreak of influenza. According to the media coverage, there is an ongoing outbreak of influenza in Mecca (Makkah) (Saudi Arabia, where Hajji\'s congregate) now, concurrent with the Hajj, and a rapid onset of respiratory symptoms is more likely to be influenza than it is MERS- CoV. We still do not have information as to the origins of the ill passengers to know if they were beginning their trips in Mecca and were returning Hajji\'s or if they began their travels elsewhere. 
Presumably the crew members began their working journeys in Dubai, but may have commuted in from elsewhere in or outside of the region (and there is mention of ill crew members). Returning to the possibility that this is related to the Hajj, a mass gathering of more than 2 million individuals from all over the world, making a religious pilgrimage to Mecca, it is interesting to review the Saudi Hajj/Umrah health requirements. While some vaccines are mandatory (meningitis vaccine, polio if coming from a country with ongoing poliovirus transmission, and yellow fever if coming from a known yellow fever endemic area, in contrast, influenza vaccine is recommended but not obligatory. \"Seasonal Influenza: \"The Saudi Ministry of Health recommends that international pilgrims be vaccinated against seasonal influenza before arrival into the Kingdom of Saudi Arabia, particularly those at increased risk of severe influenza diseases, including pregnant women, children under 5 years, the elderly, and individuals with underlying health conditions such as HIV/AIDS, asthma, and chronic heart or lung diseases. In Saudi Arabia, seasonal influenza vaccination is recommended for internal pilgrims, particularly those at risk described above, and all health-care workers in the Hajj premises.\" https://www.saudiembassy.net/hajj-and-umrah-health-requirements Given concerns re: possible MERS-CoV transmission to pilgrims visiting Saudi Arabia, many countries have increased surveillance of respiratory illnesses in returning pilgrims (notably post Hajj), and in the 6 years since identification of the MERS-CoV, there have been no cases among returning Hajji\'s and just a handful of cases among individuals returning from having performed the Umrah pilgrimage. Movement and exposures of visiting Hajj pilgrims are controlled - camels are not permitted in the area where Hajjis are congregated, healthcare workers and others identified as contacts of confirmed MERS-CoV cases are not permitted to go to the Hajj area Studies addressing the etiologies of respiratory illnesses in returning Hajji\'s have identified influenza virus, respiratory syncytial virus, parainfluenza virus, adenovirus and rhinovirus (see Respiratory infections - UK: Hajj returnees 20151011.3706464 and Respiratory virus infections - Saudi Arabia: Hajj pilgrims 2012 20130730.1854631 ). Note that the incubation period for influenza ranges from 1-4 days so transmission during travel is a plausible event. The actual number of individual identified as ill enough to require medical treatment at a hospital seems to be between 19 and 27 (taking into account the additional 8 identified and mentioned by the New York City\'s mayor\'s spokesperson in a tweet). Presumably these individuals had a fever above 100 F (37.8 C) and cough. The extension of the count to approaching 100 individuals may include those with a cough, possibly in the early stages of illness (influenza frequently begins with a dry cough), and others possibly coughing in response to hearing others coughing around them (think theater or lectures when coughing begins.) As the rapid respiratory virus screening tests should be available in a reasonable amount of time, we will post information as it becomes available, but should any knowledgeable sources have additional information that can be shared with us, ProMED would be very grateful. 
The HealthMap/ProMED maps can be found at: New York State, United States: https://promedmail.org/promed-post?place=6009759,234 Middle East: https://promedmail.org/promed-post?place=6009759,12214 . Mod.MPP] See Also MERS-CoV (29): UK (England) ex Saudi Arabia, Risk Assessment ECDC 20180830.5996187 2017 . MERS-CoV (63): Saudi Arabia (QS, TB), WHO : 20171009.5369268 2016 . Health hazards - Saudi Arabia: Hajj, Umrah, vaccination requirements 20160715.4346367 MERS-CoV (71): Saudi Arabia (MK), pilgrimage caution, WHO 20160623.4305152 2015 . MERS-CoV (138): Saudi Arabia, Jordan, post Hajj surveillance, RFI 20151009.3704734 MERS-CoV (136): Kuwait WHO, Saudi Arabia MOH, camel, Hajj 20150924.3666811 MERS-CoV (131): Saudi Arabia, animal reservoir, camels, Hajj, RFI 20150914.3643612 MERS-CoV (130): Saudi Arabia, animal reservoir, camels, Hajj 20150912.3641457 MERS-CoV (114): Saudi Arabia, animal reservoir, camels, Hajj 20150823.3597358 Respiratory infections - UK: Hajj returnees 20151011.3706464 Respiratory infections - Canada: (AB) Hajj returnees 20151020.3729641 Influenza (51): Germany ex Saudi Arabia, Hajj returnee, RFI 20151009.3704297 Influenza (49): Canada ex Saudi Arabia, Hajj returnees, susp., RFI, Alert 20151005.3693052 2014 . Meningitis, meningococcal - Saudi Arabia: prevention, Hajj travel advice 20140815.2692227 2013 . Respiratory virus infections - Saudi Arabia: Hajj pilgrims 2012 20130730.1854631 2012 . Health hazards - Saudi Arabia: updated Hajj advice 20121011.1338172 2009 . Influenza pandemic (H1N1) 2009 (113): Saudi Arabia, Hajj fatalities 20091122.4013 Influenza pandemic (H1N1) 2009 (109): Saudi Arabia, Hajj pilgrims 20091120.3997 2006 . Influenza - Saudi Arabia: Hajj concerns 20061209.3478 .mpp/ao/mpp",
818
+ # "{\"virus\": \"None\", \"country\": \"United Arab Emirates\", \"date\": \"2018-09-05\", \"cases\": \"19\", \"deaths\": \"None\"}"]]
819
+
820
+ if InContextExamples:
821
+ ntotExamplesTokens = 0
822
+ for row in InContextExamples:
823
+ for col in row:
824
+ nt = token_counter(col, model_name)
825
+ #print("\nNumber of Tokens in the example = " + str(nt))
826
+ ntotExamplesTokens = ntotExamplesTokens + nt
827
+ #
828
+ print("\nNumber of Tokens of the all examples = " + str(ntotExamplesTokens))
829
+
830
+ ###
831
+
832
+ if service_provider == "openai":
833
+ if USE_CACHE:
834
+ lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name,
835
+ temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_openai,
836
+ map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
837
+ else:
838
+ lambda_model = partial(call_model, prompt=myPromt, model=model_name,
839
+ temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_openai,
840
+ verbose=True)
841
+ elif service_provider == "gptjrc":
842
+ if USE_CACHE:
843
+ lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name,
844
+ temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_gptjrc,
845
+ map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
846
+ else:
847
+ lambda_model = partial(call_model, prompt=myPromt, model=model_name,
848
+ temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_gptjrc,
849
+ verbose=True)
850
+ # elif service_provider == "dglc":
851
+ # if USE_CACHE:
852
+ # lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name, temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples,
853
+ # handler=api_call_dglc, map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
854
+ # else:
855
+ # lambda_model = partial(call_model, prompt=myPromt, model=model_name,
856
+ # temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_dglc,
857
+ # verbose=True)
858
+
859
+ # elif service_provider == "HFonPremises":
860
+ # if USE_CACHE:
861
+ # lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name,
862
+ # temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_HFonPremises,
863
+ # map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
864
+ # else:
865
+ # lambda_model = partial(call_model, prompt=myPromt, model=model_name,
866
+ # temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_HFonPremises,
867
+ # verbose=True)
868
+
869
+
870
+
871
+ if lambda_model:
872
+ df = pd.DataFrame([["one, two, three, a step fortward Mary"], ["one, two, three, a step back"]], columns=["text"])
873
+ df["text_es"] = df["text"].apply(lambda_model)
874
+
875
+ print("\n")
876
+ print(df)
877
+
878
+ print("\nEnd Computations")
879
+
880
+
881
+
nerBio.py ADDED
The diff for this file is too large to render. See raw diff
 
retrieverRAG_testing.py ADDED
@@ -0,0 +1,339 @@
1
+ # https://www.mixedbread.ai/blog/mxbai-embed-large-v1
2
+ # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
3
+
4
+ import os
5
+ import time
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Dict
9
+
10
+ import torch
11
+ from transformers import AutoModel, AutoTokenizer
12
+ from sentence_transformers.util import cos_sim
13
+ from accelerate import Accelerator # Import from accelerate
14
+ from scipy.stats import zscore
15
+
16
+ # Set up environment variables for Hugging Face caching
17
+ os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
18
+ os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
19
+ os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
20
+
21
+ # Initialize the Accelerator
22
+ accelerator = Accelerator()
23
+
24
+ # Use the device managed by Accelerator
25
+ device = accelerator.device
26
+ print("Using accelerator device =", device)
27
+
28
+
29
+ # 1. Load the model and tokenizer
30
+ model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
31
+ tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)
32
+ modelRetriever = AutoModel.from_pretrained(model_id_Retriever)
33
+
34
+ # Accelerate prepares the model (e.g., moves to the appropriate device)
35
+ modelRetriever = accelerator.prepare(modelRetriever)
36
+
37
+
38
+
39
+
40
+ # Define the transform_query function
41
+ def transform_query(queryText: str) -> str:
42
+ """For retrieval, add the prompt for queryText (not for documents)."""
43
+ return f'Represent this sentence for searching relevant passages: {queryText}'
44
+
45
+ # Define the pooling function
46
+ def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
47
+ if strategy == 'cls':
48
+ outputs = outputs[:, 0]
49
+ elif strategy == 'mean':
50
+ outputs = torch.sum(
51
+ outputs * inputs["attention_mask"][:, :, None], dim=1
52
+ ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
53
+ else:
54
+ raise NotImplementedError
55
+ return outputs.detach().cpu().numpy()
56
+
57
+
58
+ def retrievePassageSimilarities(queryText, passages):
59
+ # Create the docs list by adding the transformed queryText and then the passages
60
+ docs = [transform_query(queryText)] + passages
61
+
62
+ # 2. Encode the inputs
63
+ inputs = tokenizer_Retriever(docs, padding=True, return_tensors='pt')
64
+
65
+ # Move inputs to the right device using accelerator
66
+ inputs = {k: v.to(device) for k, v in inputs.items()}
67
+ with torch.no_grad():
68
+ outputs = modelRetriever(**inputs).last_hidden_state
69
+ embeddings = pooling(outputs, inputs, 'cls')
70
+
71
+ similarities = cos_sim(embeddings[0], embeddings[1:])
72
+
73
+ #print('similarities:', similarities)
74
+
75
+ return similarities
76
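+
+ # Example (illustrative): the result is a 1 x len(passages) matrix of cosine similarities, e.g.
+ # sims = retrievePassageSimilarities("A man is eating a piece of bread",
+ #                                    ["A man is eating food.", "A man is riding a horse."])
+ # # sims[0][0] should score clearly higher than sims[0][1]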
+
77
+
78
+
79
+ def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
80
+
81
+ try:
82
+ similarities=retrievePassageSimilarities(queryText, passages)
83
+
84
+ #Create a DataFrame
85
+ df = pd.DataFrame({
86
+ 'Passage': passages,
87
+ 'Similarity': similarities.flatten() # Flatten the similarity tensor/array to ensure compatibility
88
+ })
89
+
90
+ # Filter the DataFrame based on the similarity threshold
91
+ df_filtered = df[df['Similarity'] >= min_threshold]
92
+
93
+ # If max_num_passages is specified, limit the number of passages returned
94
+ if max_num_passages is not None:
95
+ df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
96
+
97
+ df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
98
+
99
+ # Return the filtered DataFrame
100
+ return df_filtered
101
+
102
+ except Exception as e:
103
+ # Log the exception message or handle it as needed
104
+ print(f"An error occurred: {e}")
105
+ return pd.DataFrame() # Return an empty DataFrame in case of error
106
+
107
+
108
+
109
+ def RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=None, min_threshold=0.5):
110
+ try:
111
+ # Encoding and similarity computation remains the same
112
+
113
+ similarities = retrievePassageSimilarities(queryText, passages)
114
+
115
+ # Calculate z-scores for similarities
116
+ z_scores = zscore(similarities.flatten())
117
+
118
+ # Create a DataFrame with passages, similarities, and z-scores
119
+ df = pd.DataFrame({
120
+ 'Passage': passages,
121
+ 'Similarity': similarities.flatten(),
122
+ 'Z-Score': z_scores
123
+ })
124
+
125
+ # Filter passages based on z-score threshold
126
+ df_filtered = df[df['Z-Score'] >= z_threshold]
127
+
128
+ if min_threshold:
129
+ # Filter the DataFrame also on min similarity threshold
130
+ df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]  # apply on top of the z-score filter
131
+
132
+ # If max_num_passages is specified, limit the number of passages returned
133
+ if max_num_passages is not None:
134
+ df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
135
+
136
+ # Sort by similarity (or z-score if preferred)
137
+ df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
138
+
139
+ return df_filtered
140
+
141
+ except Exception as e:
142
+ # Log the exception message or handle it as needed
143
+ print(f"An error occurred: {e}")
144
+ return pd.DataFrame() # Return an empty DataFrame in case of error
145
+
146
+
147
+
148
+
149
+ def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
150
+ try:
151
+ # Encoding and similarity computation remains the same
152
+
153
+ similarities = retrievePassageSimilarities(queryText, passages)
154
+
155
+ # Determine threshold based on percentile
156
+ threshold = np.percentile(similarities.flatten(), percentile)
157
+
158
+ # Create a DataFrame
159
+ df = pd.DataFrame({
160
+ 'Passage': passages,
161
+ 'Similarity': similarities.flatten()
162
+ })
163
+
164
+ # Filter using percentile threshold
165
+ df_filtered = df[df['Similarity'] >= threshold]
166
+
167
+ if min_threshold:
168
+ # Filter the DataFrame also on min similarity threshold
169
+ df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]  # apply on top of the percentile filter
170
+
171
+ # If max_num_passages is specified, limit the number of passages returned
172
+ if max_num_passages is not None:
173
+ df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
174
+
175
+ # Sort by similarity
176
+ df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
177
+
178
+ return df_filtered
179
+
180
+ except Exception as e:
181
+ # Log the exception message or handle it as needed
182
+ print(f"An error occurred: {e}")
183
+ return pd.DataFrame() # Return an empty DataFrame in case of error
184
+
185
+
186
+
187
+ def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
+     try:
+         # Encoding and similarity computation (retrievePassageSimilarities is defined elsewhere)
+         similarities = retrievePassageSimilarities(queryText, passages)
+
+         # Calculate the number of passages to select based on the top fraction
+         num_passages_TopFraction = max(1, int(top_fraction * len(passages)))
+
+         # Create a DataFrame
+         df = pd.DataFrame({
+             'Passage': passages,
+             'Similarity': similarities.flatten()
+         })
+
+         # Select the top passages dynamically
+         df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')
+
+         if min_threshold is not None:
+             # Also apply the minimum-similarity threshold
+             df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]
+
+         # If max_num_passages is specified, limit the number of passages returned
+         if max_num_passages is not None:
+             df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+         # Sort by similarity
+         df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+         return df_filtered
+
+     except Exception as e:
+         # Log the exception message or handle it as needed
+         print(f"An error occurred: {e}")
+         return pd.DataFrame()  # Return an empty DataFrame in case of error
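+ # Worked example for the top-fraction filter: with 4 passages and
+ # top_fraction=0.1, max(1, int(0.1 * 4)) = 1, so only the single
+ # highest-similarity passage survives before the optional thresholds apply.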
+
+
+ if __name__ == '__main__':
+
+     queryText = 'A man is eating a piece of bread'
+
+     # Define the passages list
+     passages = [
+         "A man is eating food.",
+         "A man is eating pasta.",
+         "The girl is carrying a baby.",
+         "A man is riding a horse.",
+     ]
+
+     # df_retrieved = RAG_retrieval_Base(queryText, passages)
+     # df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5)
+     # df_retrieved = RAG_retrieval_Base(queryText, passages, max_num_passages=3)
+     df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5, max_num_passages=3)
+
+     # df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0)
+     # df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=3)
+
+     # df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80)
+     # df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80, max_num_passages=3)
+
+     # df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2)
+     # df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2, max_num_passages=3)
+
+     print(df_retrieved)
+
+     # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
+
+     print("end of computations")
+
+ # VERSION WITHOUT ACCELERATE
+ #
+ # #https://www.mixedbread.ai/blog/mxbai-embed-large-v1
+ # #https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
+ #
+ # import os
+ #
+ # os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+ # os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+ # os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+ #
+ # import time
+ # import pandas as pd
+ # import numpy as np
+ #
+ # from typing import Dict
+ #
+ # import torch
+ # from transformers import AutoModel, AutoTokenizer
+ # from sentence_transformers.util import cos_sim
+ #
+ # # For retrieval you need to pass this prompt. Please find out more in our blog post.
+ # def transform_queryText(queryText: str) -> str:
+ #     """For retrieval, add the prompt for queryText (not for documents)."""
+ #     return f'Represent this sentence for searching relevant passages: {queryText}'
+ #
+ # # The model works really well with cls pooling (default) but also with mean pooling.
+ # def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
+ #     if strategy == 'cls':
+ #         outputs = outputs[:, 0]
+ #     elif strategy == 'mean':
+ #         outputs = torch.sum(
+ #             outputs * inputs["attention_mask"][:, :, None], dim=1) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
+ #     else:
+ #         raise NotImplementedError
+ #     return outputs.detach().cpu().numpy()
+ #
+ # # 1. load model
+ # model_id = 'mixedbread-ai/mxbai-embed-large-v1'
+ # tokenizer = AutoTokenizer.from_pretrained(model_id)
+ # model = AutoModel.from_pretrained(model_id).cuda()
+ #
+ # queryText = 'A man is eating a piece of bread'
+ #
+ # # Define the passages list
+ # passages = [
+ #     "A man is eating food.",
+ #     "A man is eating pasta.",
+ #     "The girl is carrying a baby.",
+ #     "A man is riding a horse.",
+ # ]
+ #
+ # # Create the docs list by adding the transformed queryText and then the passages
+ # docs = [transform_queryText(queryText)] + passages
+ #
+ # # 2. encode
+ # inputs = tokenizer(docs, padding=True, return_tensors='pt')
+ # for k, v in inputs.items():
+ #     inputs[k] = v.cuda()
+ # outputs = model(**inputs).last_hidden_state
+ # embeddings = pooling(outputs, inputs, 'cls')
+ #
+ # similarities = cos_sim(embeddings[0], embeddings[1:])
+ #
+ # print('similarities:', similarities)
+ #
+ # # Create a DataFrame
+ # df = pd.DataFrame({
+ #     'Passage': passages,
+ #     'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
+ # })
+ #
+ # # Display the DataFrame
+ # print(df)
+ #
+ # print("end of computations")
virtuosoQueryRest.py CHANGED
@@ -3,11 +3,14 @@ from requests.auth import HTTPDigestAuth, HTTPBasicAuth
  import ssl
  import json
 
+ from joblib import Memory
 
+ cachedir = 'cached'
+ mem = Memory(cachedir, verbose=False)
 
 
 
-
+ @mem.cache
  def execute_query(endpoint, query, auth):
      headers = {
          'Content-Type': 'application/x-www-form-urlencoded',
@@ -144,32 +147,26 @@ if __name__ == '__main__':
 
      #############
 
-     #query = 'SELECT * WHERE { ?s ?p ?o } LIMIT 100'
-
-     # word = "acute sinusitis"
-     # query = f"""
-     # SELECT ?concept ?label (COUNT(?edge) AS ?score)
-     # WHERE {{
-     #     ?concept skos:prefLabel|rdfs:label|skos:altLabel|obo:hasRelatedSynonym ?label .
-     #     FILTER (LCASE(STR(?label)) = "{word}")
-     #     ?concept ?edge ?o .
-     # }}
-     # GROUP BY ?concept ?label
-     # ORDER BY DESC(?score)
-     # """
 
-     choices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
+     #choices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
+     choices = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
+                "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
+                "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
+                "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
+                "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]
 
      # Construct the FROM clauses
      from_clauses = ' '.join([f"FROM <{choice}>" for choice in choices])
 
-     word = "acute sinusitis"
+     #word = "acute sinusitis"
+     word = "pure mathematics"
      # Construct the full SPARQL query
      query = f"""
+     prefix skosxl: <http://www.w3.org/2008/05/skos-xl#>
      SELECT ?concept ?label (COUNT(?edge) AS ?score)
      {from_clauses}
      WHERE {{
-         ?concept skos:prefLabel|rdfs:label|skos:altLabel|obo:hasRelatedSynonym ?label .
+         ?concept skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?label .
          FILTER (LCASE(STR(?label)) = "{word.lower()}")
          ?concept ?edge ?o .
      }}
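
Note on the caching change above: joblib's Memory wraps execute_query in a disk-backed memoizer, keyed on the function's arguments and persisted under ./cached, so repeated identical SPARQL queries skip the HTTP round-trip. A minimal standalone sketch of the same pattern (slow_lookup is an illustrative name, not part of the repo):

    from joblib import Memory

    mem = Memory('cached', verbose=False)

    @mem.cache
    def slow_lookup(term):
        # stand-in for an expensive remote call; the result is persisted to ./cached
        return term.upper()

    print(slow_lookup('sparql'))  # computed and written to the cache on first call
    print(slow_lookup('sparql'))  # served from the on-disk cache

One caveat worth keeping in mind: because joblib keys the cache on all arguments, an auth object that does not hash stably across calls could defeat the cache for execute_query.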