Consoli Sergio committed
Commit · 232b620
Parent(s): 2800a02
major commit for change to interface to gradio Blocks
Browse files:
- .gitignore +116 -0
- app-demo-myMultiNER.py +826 -0
- common.py +2 -1
- llmqueryNer.py +881 -0
- nerBio.py +0 -0
- retrieverRAG_testing.py +339 -0
- virtuosoQueryRest.py +14 -17
.gitignore
ADDED
@@ -0,0 +1,116 @@
.idea/

config.py

.gradio
.gradio/
.gradio/*

#
# Project specific excludes
#

*.log
*.key
*.env

*.csv
*.xlsx

med_news.txt

example_sergio.sh
example_sergio_summary.sh

screenlog.0

/DONS/
/UCI_ML_Repository/
/cached/
/DONS/*
/UCI_ML_Repository/*
/cached/*

/prove/
/prove/*

*.json

/__pycache__/
/__pycache__/*

/vast_api_logs/
/vast_api_logs/*

*.tpl

./.settings/

*.Rhistory
*.Rproj
*.RData

tomcat

#
# Default excludes
#

# Binaries
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.war
*.ear
*.sar
*.class

# Maven
target/

# IntelliJ project files
*.iml
*.iws
*.ipr
.idea/

# eclipse project file
.settings/
.classpath
.project

# NetBeans specific
nbproject/private/
build/
nbbuild/
dist/
nbdist/
nbactions.xml
nb-configuration.xml

# OS
.DS_Store

# Misc
*.swp
release.properties
pom.xml.releaseBackup
pom.xml.tag
__pycache__

.Rproj.user

/bin/
app-demo-myMultiNER.py
ADDED
@@ -0,0 +1,826 @@
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,6"  # use GPUs 1 and 6 only

os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"

from transformers import file_utils
print(file_utils.default_cache_path)

import pandas as pd
from tqdm import tqdm
from gliner import GLiNER
import logging
from jinja2 import Template
from collections import Counter

from transformers import pipeline, AutoTokenizer

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# import html

import torch
torch.cuda.empty_cache()  # clear the torch CUDA cache

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}...")
if device.type == "cuda":
    print("GPU number:", torch.cuda.current_device())

import datasets

import argparse
import json
import random
import numpy as np

import tiktoken
from langchain.text_splitter import TokenTextSplitter

import gradio as gr
import re
from common import strtobool, token_counter, encoding_getter, strip_quotes
from nerBio import annotate, entitiesFusion, is_cross_inside, elinking
from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc

from joblib import Memory

cachedir = 'cached'
mem = Memory(cachedir, verbose=False)

# To completely delete the cache:
# mem.clear(warn=False)
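
# Illustrative aside: how the joblib Memory object above is typically used.
# Decorating a function with @mem.cache persists its results under `cachedir`,
# keyed on the arguments, so repeated calls load from disk instead of
# recomputing (`slow_square` is a hypothetical example, not part of this file):
#
#     @mem.cache
#     def slow_square(x):
#         return x * x
#
#     slow_square(4)  # computed once; subsequent calls hit the on-disk cache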

examples = [
    ["He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
    ["He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None],
    ["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. ", None],
    ["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. \"So far, the chikungunya disease is still under control nationwide,\" he told reporters after visiting Sultanah Nur Zahirah Hospital here. Present was Terengganu Health Director Dr. Nordiyanah Hassan. Liow said that so far, there is no specific medicine to treat the chikungunya fever disease spread by _Aedes_ mosquito. \"So, I would like to call on the public to be careful particularly during the wet season now because _Aedes_ mosquito is easy to breed,\" he said. To contain the spread of the disease, he said, the ministry had taken several measures including intensifying the campaign to rid of _Aedes_ mosquito and holding lectures on the outbreak. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. Meanwhile, he said 63 health projects costing RM458 million [USD 131 230 211] had been approved for implementation in Terengganu under the Ninth Malaysia Plan and some had started.", None],
    ["Carcinoma", None],
    ["The doctor diagnosed the patient with basal cell carcinoma, a common type of skin cancer.", None],
    ["West Nile virus", None],
    ["Legionellosis", None],
    ["Eight years ago I started with Fosamax for 3-4 years and then took Actonel. In March, I decided not to take Actonel any longer. I had been on it for too long and was fearful of esophageal cancer and bone breakage. Now my doctor wants me to take the Prolia injections, which I am not going to do. I am not going to continue with any drugs. My bone density recently done was in the minuses. I do work with a personal trainer and execise daily. I am searching for alternative ways to deal with this problem.", None],
    ["Does Chicago have any stores and does Joe live here?", None],
    ["Cholera has been reported every week since November 1994. By 5 November 1995 at total of 12,344 with 245 deaths have been notified. Of these, 879 cases with 4 deaths were reported for the period 9 October to 5 November 1995. Control efforts have not succeeded in preventing the spread of the epidemic and when cases were detected on Sao Nicolau and Sal Islands in the period 9 October to 5 November all nine inhabited islands of Cap Verde had become infected. The last cholera epidemic in Cap Verde occurred in 1979. (See also Weekly Epidemiological Record No. 44, 3 November 1995) Côte d'Ivoire: A cholera outbreak which started in September 1995 caused 2,027 cases and 150 deaths up to 12 November 1995. The first cases were reported in Department de l'Ouest on 18 September 1995. Cases were subsequently reported in Department de Nord and most recently in Department du Centre and Department de Sud. The WHO Representative assisted in the organization of a team to visit the area and evaluate the situation as well as arranging for medical supplies. (1.12.95) Iran, Islamic Republic of,: Kordestan Province has been declared free of cholera. (1.12.95) Iraq: An outbreak of cholera reported from Sulaimaniyah Governorate in Northern Iraq has resulted in 519 cases, 264 of which have been confirmed, and 3 deaths to date. Vibrio cholerae O1 serotype Ogawa has been isolated. At the request of the Iraqi Ministry of Health, a WHO consultant has been sent to the area to assess and monitor the situation, provide guidance to the health authorities, and coordinate inputs by non-governmental organizations. WHO has also made available essential treatment supplies. An intensive media campaign to raise public awareness about essential preventive measures has been successful in containing the spread of the outbreak. (1.12.95) Senegal: Despite the fact that cholera has been endemic in countries bordering Senegal for the past two years, no cases were reported from Senegal until mid- August 1995. Between 15 August and 17 November 1995, 852 case and 43 deaths were notified. A further 731 cases with 37 deaths have been reported for the period 1 September to 12 November. Most cases were in the Departments of Dakar and Pikine in the Dakar Region and recently also Departments of Mbacke and Touba in Diourbel Region. ", None],
]


models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal"]  # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
# models_List = ["Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "NCBO/BioPortal"]
# models_List = ["NCBO/BioPortal"]

# categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
categories_List = ["MED", "LOC", "PER", "ORG", "DATE", "MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS", "PROC"]

POSSIBLE_KGchoices_List = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
                           "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
                           "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
                           "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
                           "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]

modelGliner = None
modelGlinerBio = None

num_cores_Gliner_forDemo = 0  # 0 means use the GPU for GLiNER
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

encod = encoding_getter('microsoft/deberta-v3-large')
text_splitter = TokenTextSplitter(
    # separators=separators,
    encoding_name=encod.name,
    chunk_size=80000,
    chunk_overlap=50,
    length_function=len,
    add_start_index=True,
)
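
# Illustrative aside: the splitter above chunks long inputs by token count
# (80,000-token windows with a 50-token overlap) before they are passed to the
# LLM. A quick usage sketch:
#
#     chunks = text_splitter.split_text(very_long_document)
#     print(len(chunks), chunks[0][:80])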

pipe_dict = {}
for modelName in models_List:
    tsk = "token-classification"
    if (("/gliner" in modelName) == False) and (("NCBO" in modelName) == False):
        pipe = pipeline(
            tsk,
            model=modelName,
            aggregation_strategy="simple",
            device=device,
        )
        pipe_dict[modelName] = pipe
    elif ("/gliner" in modelName):
        if not tokenizerGliner:
            tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
        if "_bio-" in modelName:
            if num_cores_Gliner_forDemo > 0:
                modelGlinerBio = GLiNER.from_pretrained(modelName)  # "urchade/gliner_large_bio-v0.1"
            else:
                modelGlinerBio = GLiNER.from_pretrained(modelName, map_location=device)
        else:
            if num_cores_Gliner_forDemo > 0:
                modelGliner = GLiNER.from_pretrained(modelName)  # "knowledgator/gliner-multitask-large-v0.5" - "urchade/gliner_large-v2.1"
            else:
                modelGliner = GLiNER.from_pretrained(modelName, map_location=device)
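
# Illustrative aside: each cached transformers pipeline returns aggregated
# entity spans; the score below is made up for illustration:
#
#     ner = pipe_dict["Babelscape/wikineural-multilingual-ner"]
#     ner("Chikungunya cases were reported in Malaysia.")
#     # -> [{'entity_group': 'LOC', 'word': 'Malaysia',
#     #      'start': 35, 'end': 43, 'score': 0.99}]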

#### GPT@JRC API
# if args.service_provider == "gptjrc":
key_gptjrc = ""
fkeyname = "GPTJRC-APItoken.key"
if os.path.exists(fkeyname):
    with open(fkeyname) as f:
        key_gptjrc = f.read()
else:
    key_gptjrc = os.environ['key_gptjrc']
setup_gptjrc(key_gptjrc)
#####


# Handler for the dropdown selection
def get_urls(word, df_annotated_combined):
    # Filter the DataFrame to rows where 'ALLURIScontext' is a non-empty list with non-blank content
    # valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
    valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (
        isinstance(x, list) and (isinstance(x, list) and len(x) > 0) and (not (len(x) == 1 and not str(x[0]).strip()))))]

    # Check if the word is in the filtered DataFrame
    if word in valid_entries['word'].values:
        urls = valid_entries.loc[valid_entries['word'] == word, 'ALLURIScontext'].values[0]

        if 'namedEntity' in df_annotated_combined.columns:
            firsturlinlist = df_annotated_combined.loc[df_annotated_combined['word'] == word, 'namedEntity']
            firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
            if firsturlinlist and firsturlinlist in urls:
                # Move the preferred URL to the first position
                urls.remove(firsturlinlist)
                urls.insert(0, firsturlinlist)

        html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
        # html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
        return html_links
    return ""


### @mem.cache
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):

    if EntityLinking:
        EnableNEL = "True"
    else:
        EnableNEL = "False"

    if not text:
        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
        state = {
            "text": "",
            "df_annotated_dict": dict(),
            "KGchoices": KGchoices
        }
        return {"text": text, "entities": []}, html_output, state, [], ""

    df_annotated = pd.DataFrame()

    parser = argparse.ArgumentParser()

    parser.add_argument("--model_id", type=str, default=models_List[0], help="model to use")
    parser.add_argument("--debug", type=str, default="True", help="set debug mode")
    parser.add_argument("--source_column", type=str, default="ContextToAnnotate")
    parser.add_argument("--entities_filter_threshold", type=float, default=ScoreFilt)  # float: the score slider is continuous in [0, 1]

    parser.add_argument("--SEED", type=int, default=41)
    parser.add_argument("--batch_size", type=int, default=32)  # 4 - 8 - 16
    parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for the GLiNER annotation")  # 0 means use the GPU for GLiNER

    parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to perform entity linking or not")
    parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file where the geonames api key is stored")
    parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file where the virtuoso endpoint dba pwd is stored")
    parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file where the NCBO BioPortal api key is stored")

    # consose 20250205:
    # KGchoices = None
    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'NCIT']
    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
    if KGchoices:
        KGchoices.sort()
        parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
                            help="list of ontologies to which to restrict the entity linking task")
    # consose 20250502:
    if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
        parser.add_argument("--USE_CACHE", type=str, default="False",
                            help="whether to use cache for the NER and NEL tasks or not")
    else:
        # print("Lists do not have the same elements")
        parser.add_argument("--USE_CACHE", type=str, default="False",
                            help="whether to use cache for the NER and NEL tasks or not")

    parser.add_argument("--num_cores_eLinking", type=int, default=10, help="parallel processing for the entity linking process")

    parser.add_argument("--computeEntityContext", type=str, default="False",
                        help="whether to extract a readable context for the concept from the extracted triples")
    parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
                        help="whether to extract a readable context for the concept from the triples of all the entities extracted from the endpoint")
    parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
                        help="whether to use a retriever to create the context of the entities from the triples coming from the KGs")

    parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
    parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="llm to use")
    parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")

    parser.add_argument("--temperature", type=float, default=0.01)  # float: an int type would truncate 0.01 to 0

    args = parser.parse_args()

    df_ToAnnotate = pd.DataFrame()

    previous_text = ""
    previous_df_annotated_dict = dict()
    previous_kg_choices = []
    if state:
        previous_text = state.get("text", "")
        previous_df_annotated_dict = state.get("df_annotated_dict", {})
        previous_kg_choices = state.get("KGchoices", [])

    # print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
    # if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
    # if (not history_dict) or (history_dict[args.source_column][0] != text):
    if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices)):

        for model_id in models_List:  # always run all the annotations; filter them only afterwards
            # for model_id in ModelsSelection:

            # if history_dict and (history_dict[args.source_column][0] == text):
            #     if model_id in hhist['model'].unique():
            #         continue

            parser.set_defaults(model_id=model_id)

            args = parser.parse_args()

            print("ARGS:")
            print(args)

            # In machine learning tasks, particularly with models that involve stochasticity (like text generation), set seeds for the random number generators so that results are reproducible. For transformers models this means seeding Python's random module, NumPy, and PyTorch.
            # Before creating the pipeline and running the text generation, set the seeds like this:
            random.seed(args.SEED)
            np.random.seed(args.SEED)
            torch.manual_seed(args.SEED)
            torch.cuda.manual_seed_all(args.SEED)
            ###

            df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})

            if "SentenceRef" not in df_ToAnnotate.columns:
                df_ToAnnotate["SentenceRef"] = None
                df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
                                                                 col != 'SentenceRef']]  # move it to the first position

            df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int)
            df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)

            # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            # if strtobool(args.debug):
            #     print(f"Device: {device}...")
            #     if device.type == "cuda":
            #         print("GPU number:", torch.cuda.current_device())

            pipeToUse = None
            if (("gliner" in args.model_id) == False) and (("NCBO" in args.model_id) == False):
                pipeToUse = pipe_dict[args.model_id]

            new_annotations = annotate(df_ToAnnotate, args, pipeToUse, tokenizerGliner, modelGliner, modelGlinerBio, device)
            if not new_annotations.empty:
                if df_annotated.empty:
                    # If df_annotated is empty, just assign new_annotations to it
                    df_annotated = new_annotations
                else:
                    # Otherwise, concatenate new_annotations to it
                    df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True)

        state = {
            "text": text,
            "df_annotated_dict": df_annotated.to_dict(),
            "KGchoices": KGchoices
        }

    else:

        print("ARGS:")
        print(args)

        # Set the seeds again for reproducibility (see the note above).
        random.seed(args.SEED)
        np.random.seed(args.SEED)
        torch.manual_seed(args.SEED)
        torch.cuda.manual_seed_all(args.SEED)
        ###

        history = pd.DataFrame(previous_df_annotated_dict)
        df_annotated = history.copy()

        state = {
            "text": text,
            "df_annotated_dict": df_annotated.to_dict(),
            "KGchoices": KGchoices
        }

    quoted_text = text.startswith('"') and text.endswith('"')
    if (not df_annotated.empty) or quoted_text:

        if not df_annotated.empty:
            # filter now per models selection
            df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
            if df_annotated.empty and quoted_text == False:
                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
                return {"text": text, "entities": []}, html_output, state, [], ""

        df_annotated_combined = pd.DataFrame()
        if not df_annotated.empty:
            df_annotated_combined = entitiesFusion(df_annotated, args)
            if df_annotated_combined.empty and quoted_text == False:
                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
                return {"text": text, "entities": []}, html_output, state, [], ""
            else:
                if not df_annotated.empty:
                    df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999)  # cut all the cross-inside spans at the 0.999 threshold to avoid linking them

        cache_prefix_fp = "LLMQUERYNER"
        cache_nameLLMs = cache_prefix_fp + "___" + "__".join(
            [args.service_provider, args.model_name, str(args.temperature)]).replace(
            " ", "_") + ".json"

        load_map_query_input_output = None
        if strtobool(args.USE_CACHE):
            if os.path.exists(cache_nameLLMs):
                with open(cache_nameLLMs) as f:
                    load_map_query_input_output = json.load(f)
            else:
                load_map_query_input_output = {}

        ### entity linking part:
        if strtobool(args.entity_linking):

            cache_map_geonames = None
            if strtobool(args.USE_CACHE):
                cache_filename = "CACHE_geonames.json"
                if os.path.exists(cache_filename):
                    with open(cache_filename) as f:
                        cache_map_geonames = json.load(f)
                else:
                    cache_map_geonames = {}

            key_geonames = ""
            if args.geonameskey_filename:
                fkeyname = args.geonameskey_filename
                with open(fkeyname) as f:
                    key_geonames = f.read()
            else:
                key_geonames = os.environ['key_geonames']

            cache_map_virtuoso = None
            if strtobool(args.USE_CACHE):
                cacheVirtuoso_filename = "CACHE_virtuoso.json"
                if os.path.exists(cacheVirtuoso_filename):
                    with open(cacheVirtuoso_filename) as f:
                        cache_map_virtuoso = json.load(f)
                else:
                    cache_map_virtuoso = {}

            key_virtuoso = ""
            if args.virtuosokey_filename:
                fkeyname = args.virtuosokey_filename
                with open(fkeyname) as f:
                    key_virtuoso = f.read()
            else:
                key_virtuoso = os.environ['key_virtuoso']

            # Here for the EXACT MATCHING "" - if the desired term has not been identified by the NER, add it to the dataframe:

            if df_ToAnnotate.empty:
                df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})

                if "SentenceRef" not in df_ToAnnotate.columns:
                    df_ToAnnotate["SentenceRef"] = None
                    df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
                                                                     col != 'SentenceRef']]  # move it to the first position

                df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
                df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
                    df_ToAnnotate[args.source_column]).transform('min').astype(int)
                df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)

            # Condition to find the missing SentenceRefs
            missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])

            # Condition to check whether ContextToAnnotate starts and ends with quotes
            quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
                args.source_column].str.endswith('"')

            # Combine both conditions
            condition = missing_sentence_refs & quoted_context

            # Select the rows of df_ToAnnotate that meet the condition
            rows_to_add = df_ToAnnotate[condition]

            rows_to_add['model'] = "Forced"
            rows_to_add['entity_group'] = "MISC"
            rows_to_add['word'] = rows_to_add[args.source_column]
            rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
            rows_to_add['score'] = 1.0
            rows_to_add['start'] = int(1)
            rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
            rows_to_add['IsGeo'] = None
            rows_to_add['IsBio'] = None
            rows_to_add['IsCrossInside'] = 0.0

            if df_annotated_combined.empty:
                df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)

            # Append these rows to df_annotated_combined
            df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)

            df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
            df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)

            df_annotated_combined = df_annotated_combined.sort_values(
                by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
                ascending=[True, True, True, True, False])

            # Now df_annotated_combined contains the additional rows

            df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(
                df_annotated_combined, text_splitter, args, key_geonames, cache_map_geonames,
                key_virtuoso, cache_map_virtuoso, load_map_query_input_output, device)

            if strtobool(args.USE_CACHE):
                if cache_map_geonames_AFTER is not None:
                    with open(cache_filename, "w") as f:
                        json.dump(cache_map_geonames_AFTER, f)

                if cache_map_virtuoso_AFTER is not None:
                    with open(cacheVirtuoso_filename, "w") as f:
                        json.dump(cache_map_virtuoso_AFTER, f)

                if load_map_query_input_output_AFTER is not None:
                    with open(cache_nameLLMs, "w") as f:
                        json.dump(load_map_query_input_output_AFTER, f)

        ### end entity linking part

        ### filter by the selected categories only
        # # df_annotated_combined = df_annotated_combined[df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in CategoriesSelection])]
        # if "MED" in CategoriesSelection:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
        # else:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection])
        # df_annotated_combined = df_annotated_combined[filter_mask]
        #
        # if "MED" in CategoriesSelection:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1)
        # elif "OTHER" in CategoriesSelection:
        #     filter_mask = ~(
        #         df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
        # else:
        #     filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
        #         [cat.lower() for cat in CategoriesSelection])

        filter_mask = df_annotated_combined['entity_group'].str.lower().isin(
            [cat.lower() for cat in CategoriesSelection])
        if "MED" in CategoriesSelection:
            filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & (df_annotated_combined['IsBio'] == 1)
        if "MISC" in CategoriesSelection:
            # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List]))
            # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1)
            filter_mask |= ~(
                df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(
                df_annotated_combined['IsBio'] == 1)  # with this clause I include not only the categories labelled as MISC, but also the others that are not MED, PER, ORG, LOC

        df_annotated_combined = df_annotated_combined[filter_mask]
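
        # Illustrative aside: the mask composition above in miniature. For
        #     df = pd.DataFrame({"entity_group": ["MED", "FOO", "LOC"], "IsBio": [1, 0, 0]})
        # with known categories known = ["med", "loc", "per", "org"]:
        #     mask = df["entity_group"].str.lower().isin(["loc"])    # explicit picks
        #     mask |= ~df["entity_group"].str.lower().isin(known)    # plus any unlisted label
        #     df[mask]  # -> the FOO and LOC rows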
        if df_annotated_combined.empty:
            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
            return {"text": text, "entities": []}, html_output, state, [], ""

        ###

        # df_annotated_combined = is_cross_inside(df_annotated_combined, args)

        if 'IsCrossInside' in df_annotated_combined.columns:
            df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
            if df_annotated_combined.empty:
                html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
                return {"text": text, "entities": []}, html_output, state, [], ""

        dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")

        ### continue linking part:
        if strtobool(args.entity_linking):
            # ##### this is to pass the links:

            # Create a new column for the entities with links
            df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
                lambda row: (
                    f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
                    if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
                        'word']
                ),
                # lambda row: (
                #     f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
                #     if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
                #         'word']
                # ),
                axis=1
            )

            # Create a new dictionary with the entity information and the link
            dict_annotated_combined_NEL = df_annotated_combined[
                ["end", "entity_group", "score", "start", "entity_with_link"]].to_dict(orient="records")

            # Sort the entities by their start index
            dict_annotated_combined_NEL.sort(key=lambda x: x['start'])

            # Map entity groups to highlight colors
            entity_colors = {
                "MED": "#E6E6E6",
                "PER": "#FFC0CB",
                "ORG": "#C6F4D6",
                "LOC": "#FFFFCC",
                "MISC": "#F5DEB3"
            }

            text_with_links = text
            offset = 0
            for entity in dict_annotated_combined_NEL:
                start = entity["start"] + offset
                end = entity["end"] + offset
                entity_text = entity["entity_with_link"]
                text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
                offset += len(entity_text) - (end - start)
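
            # Worked micro-example of the offset bookkeeping above (illustrative):
            # with text = "flu in Kedah" and spans sorted by start, replacing
            # "flu" (0, 3) with "<b>flu</b>" lengthens the string by 7, so the
            # accumulated offset shifts "Kedah" (7, 12) to (14, 19) and the
            # second replacement still lands on the right characters.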

            # # Create the text with entities highlighted and linked
            # text_with_links = text
            # offset = 0
            # for entity in dict_annotated_combined_NEL:
            #     start = entity["start"] + offset
            #     end = entity["end"] + offset
            #     entity_text = entity["entity_with_link"]
            #     entity_group = entity["entity_group"]
            #
            #     color = entity_colors.get(entity_group, "#dbeafe")  # Default
            #     darker_color = "#008080"
            #
            #     if "https:" in entity_text:
            #         text_with_links = text_with_links[
            #             :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[
            #             end:]
            #         offset += len(
            #             f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px"><a style="color: {darker_color}" href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a> <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
            #             end - start)
            #         # text_with_links = text_with_links[:start] + f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>' + text_with_links[end:]
            #         # offset += len(
            #         #     f'<span style="background-color: {color}"><a href="{entity_text.split(">")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}</a></span>') - (
            #         #     end - start)
            #         #
            #         # text_with_links = text_with_links[:start] + entity_text + text_with_links[end:]
            #         # offset += len(entity_text) - (end - start)
            #     else:
            #         text_with_links = text_with_links[
            #             :start] + f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>' + text_with_links[end:]
            #         offset += len(
            #             f'<span style="background-color: {color}; border-radius: 2px; padding: 2px 4px">{entity_text} <span style="color: {darker_color}; font-size: 0.8em">{entity_group}</span></span>') - (
            #             end - start)
            #         # text_with_links = text_with_links[
            #         #     :start] + f'<span style="background-color: {color}">{entity_text}</span>' + text_with_links[
            #         #     end:]
            #         # offset += len(
            #         #     f'<span style="background-color: {color}">{entity_text}</span>') - (end - start)

            # Update state with the DataFrame
            state["df_annotated_combined_dict"] = df_annotated_combined.to_dict()

            if 'ALLURIScontext' in df_annotated_combined.columns:
                # words_for_dropdown = df_annotated_combined[
                #     df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
                #     'word'].unique().tolist()
                words_for_dropdown = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip()))))]['word'].unique().tolist()
                words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
                words_for_dropdown.insert(0, "")
            else:
                words_for_dropdown = []

            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"

            # return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""

        else:
            html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""

    else:

        html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
        return {"text": text, "entities": []}, html_output, state, [], ""


# "FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1"


def update_urls(selected_word, state):
    if "df_annotated_combined_dict" in state:
        # Convert the state dictionary back into a DataFrame
        df = pd.DataFrame(state["df_annotated_combined_dict"])

        # # Filter the DataFrame to rows where 'ALLURIScontextFromNCBO' is not empty or None
        # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
        # Filter the DataFrame to rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
        valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip()))))]

        # Check if the selected word is in the filtered DataFrame
        if selected_word in valid_entries['word'].values:
            urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
            if 'namedEntity' in df.columns:
                firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
                firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
                if firsturlinlist and firsturlinlist in urls:
                    # Move the preferred URL to the first position
                    urls.remove(firsturlinlist)
                    urls.insert(0, firsturlinlist)

            # Convert the list of URLs to an HTML string with clickable links
            html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
            # html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
            return html_links
        return ""

    else:
        return ""


# demo = gr.Interface(
#     fn=nerBio,
#     inputs=[
#         gr.Textbox(label="Input text", placeholder="Enter text here..."),
#         gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List),
#         gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List),
#         gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7),
#         gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False),  # True False
#         # gr.CheckboxGroup(POSSIBLE_KGchoices_List, label="KGchoices Selection", value=POSSIBLE_KGchoices_List, visible=True),
#         gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List),
#         gr.State(value={})
#     ],
#     outputs=[
#         gr.HighlightedText(label="Annotated Text"),
#         gr.HTML(label="Linked Text", show_label=True, visible=True),  # use gr.HTML to render the annotated text with links
#         gr.State(),
#         gr.Dropdown(label="Annotated Concepts", interactive=True, visible=True),
#         gr.Textbox(label="Linked Entities", interactive=False, visible=True)
#     ],
#     live=True,
#     title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)",
#     description="""(same description text as in the Blocks layout below)""",
#     examples=examples,
#     cache_examples=False,
#     article="""(same categories legend as in the Blocks layout below)"""
# )


# Define the Gradio interface using Blocks
# description = "This application performs biomedical named-entity recognition and linking."
with gr.Blocks(title="BioAnnotator") as demo:

    gr.Markdown("# BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)")
    gr.Markdown("""
    This application performs biomedical named-entity recognition and linking.

    **Description:**
    *Interoperability* – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key to a successful digital transformation.
    The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public-sector interoperability and will serve as a main EC policy framework for the years to come.
    Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.

    In this demo we present the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, select one or more NER models and enter some text to get it processed. Also select the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, choose whether to perform Named-Entity Linking (NEL) and whether to restrict the linking to specific biomedical ontologies only (acronym descriptions at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
    """)

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
            models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
            categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
            score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7)
            nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
            kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
            state = gr.State(value={})

        with gr.Column():
            annotated_text = gr.HighlightedText(label="Annotated Text")
            linked_text = gr.HTML(label="Linked Text", show_label=True, visible=True)
            word_dropdown = gr.Dropdown(label="Annotated Concepts", show_label=True, visible=True, interactive=True)
            urls_html = gr.HTML(label="Linked Entities", show_label=True, visible=True)

    ## Define the interactions
    # text_input.change(fn=nerBio, inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection, state], outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])
    # Wire all inputs so that any change re-runs the annotation
    inputs = [text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection]
    for input_component in inputs:
        input_component.change(fn=nerBio,
                               inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
                                       kgchoices_selection, state],
                               outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])

    word_dropdown.change(fn=update_urls, inputs=[word_dropdown, state], outputs=urls_html)

    # Add examples
    gr.Examples(examples=examples,
                inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox,
                        kgchoices_selection])

    gr.Markdown("""
    **Categories Legend:**
    - MED | Medical
    - LOC | Locations
    - PER | Persons
    - ORG | Organizations
    - MISC | Miscellanea
    - CONC | Concepts & Ideas
    - BIOP | Biological
    - ACTI | Activities & Behaviors
    - ANAT | Anatomy
    - CHEM | Chemicals & Drugs
    - DEVI | Devices
    - DISO | Disorders
    - GENE | Genes & Molecular Sequences
    - GEOG | Geographic Areas
    - LIVB | Living Beings
    - OBJC | Objects
    - OCCU | Occupations
    - ORGA | Organizations
    - PHEN | Phenomena
    - PHYS | Physiology
    - PROC | Procedures
    """)


demo.launch()
# demo.launch(share=True)  # Share your demo with just one extra parameter
common.py
CHANGED
@@ -5,11 +5,12 @@ import numpy as np
 import tiktoken
 from langchain.text_splitter import TokenTextSplitter
 
+# Function to cleanly strip quoted strings
 def strip_quotes(text):
     if text.startswith('"') and text.endswith('"'):
         return text[1:-1]
     return text
-
+
 def strtobool(val):
     val = val.lower()
     if val in ('yes', 'true', 't', '1'):
llmqueryNer.py
ADDED
@@ -0,0 +1,881 @@
import os
import sys

import openai
import json
import time
from tqdm import tqdm

import logging

from functools import partial
import pandas as pd

import tiktoken
from langchain.text_splitter import TokenTextSplitter

import argparse

from common import cleanInputText, encoding_getter, tokenizer, token_counter

#from llmqueryHF import api_call_HFonPremises

#from dgl_client.api_cli import APIClient, InferenceClient

#DGL_API_ENDPOINT = "https://www.diglife.eu/inference"
#client_dglc = InferenceClient(backend_url=DGL_API_ENDPOINT)

fkeyname = "GPTJRC-APItoken.key"
key_gptjrc = ""
if os.path.exists(fkeyname):
    with open(fkeyname) as f:
        key_gptjrc = f.read()
else:
    key_gptjrc = os.environ['key_gptjrc']
clientGPTJRC = openai.OpenAI(api_key=key_gptjrc, base_url="https://api-gpt.jrc.ec.europa.eu/v1")


"""
Query an LLM API endpoint on a list of texts, seamlessly.

Features:
- built-in retry in case of error
- caching of the results in case of crash
- call the LLM with a lambda or as a regular function call

Supported APIs:
- OpenAI
- GPT@JRC
- F7 (DigLife)

Known issues:
- the cache is written after each successful call, which could result in slowdowns for large datasets
- for the moment only OpenAI's rate-limit error is handled specifically; any other persistent error will result in a crash
"""
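
# Minimal usage sketch (illustrative only, not part of the original module; it assumes
# a valid GPT@JRC token is available so that setup_gptjrc() can reach the API):
#
#   from llmqueryNer import setup_gptjrc, process_list
#
#   setup_gptjrc()
#   translations = process_list(["hello world", "hello everybody"],
#                               "you will translate to Spanish",
#                               "gptjrc", "gpt-35-turbo-16k", 0,
#                               cache_prefix_fp="DEMOCACHE")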


# ### OPENAI API

def setup_openai(org=None, key=None):
    if org is not None:
        openai.organization = org
    # else:
    #     openai.organization = os.getenv("OPENAI_API_ORGANIZATION")

    if key is not None:
        openai.api_key = key
    else:
        openai.api_key = os.getenv("OPENAI_API_KEY")
    #
    print(model_list_openai())

def api_call_openai(prompt: str, input_text: str, model: str, temperature: int, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [], debug=False, args: argparse.Namespace = None):
    """ call the OpenAI API, with a retry in case of RateLimitError """

    if not prompt or prompt.strip() == "" or not input_text or input_text.strip() == "":
        logging.warning("No text or prompt supplied! Skipping it!")
        return None

    if delimiter and len(delimiter) > 0:
        input_text = delimiter + input_text + delimiter

    response = None

    myMessages = []
    if InContextExamples:
        for row in InContextExamples:
            myMessages.append({"role": "system", "content": prompt})
            for indCol, colVal in enumerate(row):
                if indCol == 0:
                    if delimiter and len(delimiter) > 0:
                        myMessages.append({"role": "user", "content": (delimiter + colVal + delimiter)})
                    else:
                        myMessages.append({"role": "user", "content": colVal})
                elif indCol == 1:
                    myMessages.append({"role": "assistant", "content": colVal})

    myMessages.append({"role": "system", "content": prompt})
    myMessages.append({'role': 'user', 'content': input_text})

    max_retries = 50
    iteration = 1
    while response is None and max_retries > 0:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                # messages=[
                #     {"role": "system", "content": prompt},
                #     {'role': 'user', 'content': input_text},
                # ],
                messages=myMessages,
                temperature=temperature,
                #max_tokens=32000,  #it gives an error
                #max_response_tokens=32000  #it gives an error
            )
        except openai.RateLimitError as e:
            response = None
            max_retries = max_retries - 1
            print(e)
            nt = token_counter((prompt + input_text), model)
            print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
            print("current iteration ", iteration)
            print("try other ", max_retries, " times")
            print("sleeping", int(iteration * timeout_retry), "s")
            time.sleep(int(iteration * timeout_retry))
            iteration = iteration + 1
        except Exception as err:
            response = None
            max_retries = max_retries - 1
            print(err)
            nt = token_counter((prompt + input_text), model)
            print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
            print("current iteration ", iteration)
            print("try other ", max_retries, " times")
            print("sleeping", int(iteration * timeout_retry), "s")
            time.sleep(int(iteration * timeout_retry))
            iteration = iteration + 1

    if (response is None) and (max_retries <= 0):
        print("\n")
        print(prompt + input_text)
        print("\n")
        print("\nTried many times and did not succeed, there is something strange. Check the problem... exiting now\n")
        sys.exit()

    return response
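
# For reference, the message list built by api_call_openai when one in-context example
# ["src", "tgt"] is supplied looks like this (illustrative sketch):
#   [{"role": "system",    "content": prompt},
#    {"role": "user",      "content": "```src```"},
#    {"role": "assistant", "content": "tgt"},
#    {"role": "system",    "content": prompt},
#    {"role": "user",      "content": "```input_text```"}]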

def model_list_openai():
    return openai.Model.list()


### GPT@JRC API

def setup_gptjrc_formerOpenAI(token=None):
    if token is None:
        token = os.getenv("GPTJRC_TOKEN")
    openai.organization = ""
    openai.api_key = token
    #openai.api_type = "open_ai"
    openai.api_base = "https://api-gpt.jrc.ec.europa.eu/v1"
    #
    print(model_list_gptjrc())


def setup_gptjrc(token=None):
    # if token is None:
    #     #token = os.getenv("GPTJRC_TOKEN")
    #     token = os.getenv("OPENAI_API_KEY")
    #
    # clientGPTJRC = openai.OpenAI(api_key=token, base_url="https://api-gpt.jrc.ec.europa.eu/v1")

    all_models = clientGPTJRC.models.list()
    # for model in all_models:
    #     print(model.id)

    chat_models = [model for model in all_models.data if model.model_usage == "chat"]
    print(f"\nGPTJRC - Found {len(chat_models)} chat models:")
    for model in chat_models:
        print("  " + str(model.id))
    embed_models = [model for model in all_models.data if model.model_usage != "chat"]

    print(f"\nGPTJRC - Found {len(embed_models)} embedding models:")
    for model in embed_models:
        print("  " + str(model.id))


def api_call_gptjrc(prompt: str, input_text: str, model: str, temperature: int, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [], debug=False, args: argparse.Namespace = None):

    if not prompt or prompt.strip() == "" or not input_text or input_text.strip() == "":
        logging.warning("No text or prompt supplied! Skipping it!")
        return None

    if delimiter and len(delimiter) > 0:
        input_text = delimiter + input_text + delimiter

    response = None

    myMessages = []
    if InContextExamples:
        for row in InContextExamples:
            myMessages.append({"role": "system", "content": prompt})
            for indCol, colVal in enumerate(row):
                if indCol == 0:
                    if delimiter and len(delimiter) > 0:
                        myMessages.append({"role": "user", "content": (delimiter + colVal + delimiter)})
                    else:
                        myMessages.append({"role": "user", "content": colVal})
                elif indCol == 1:
                    myMessages.append({"role": "assistant", "content": colVal})

    myMessages.append({"role": "system", "content": prompt})
    myMessages.append({'role': 'user', 'content': input_text})


    max_retries = 50
    iteration = 1
    while response is None and max_retries > 0:
        try:
            # if InContextExamples:
            #     response = openai.ChatCompletion.create(
            #         headers={"Authorization": "Bearer " + openai.api_key},
            #         model=model,
            #         messages=[
            #             {"role": "system", "content": prompt},
            #             {'role': 'user', 'content': InContextExamples[0][0]},
            #             {'role': 'assistant', 'content': InContextExamples[0][1]},
            #             {"role": "system", "content": prompt},
            #             {'role': 'user', 'content': InContextExamples[1][0]},
            #             {'role': 'assistant', 'content': InContextExamples[1][1]},
            #             {"role": "system", "content": prompt},
            #             {'role': 'user', 'content': InContextExamples[2][0]},
            #             {'role': 'assistant', 'content': InContextExamples[2][1]},
            #             {"role": "system", "content": prompt},
            #             {'role': 'user', 'content': input_text},
            #         ],
            #         temperature=temperature,
            #         # max_tokens=4000,  #20000, #32000, #it gives an error
            #         # max_response_tokens=32000  #it gives an error
            #     )
            # else:

            # former OpenAI call
            # response = openai.ChatCompletion.create(
            #     headers={"Authorization": "Bearer " + openai.api_key},
            #     model=model,
            #     # messages=[
            #     #     {"role": "system", "content": prompt},
            #     #     {'role': 'user', 'content': input_text},
            #     # ],
            #     messages=myMessages,
            #     temperature=temperature,
            #     #max_tokens=4000,  #20000, #32000, #it gives an error
            #     #max_response_tokens=32000  #it gives an error
            # )

            response = clientGPTJRC.chat.completions.create(
                model=model,
                stream=False,
                # messages=[{"role": "user", "content": "Hello!"}],
                messages=myMessages,
                temperature=temperature,
            )

            #print(response.choices[0].message.content)

        except openai.RateLimitError as e:
            response = None
            max_retries = max_retries - 1
            print(e)
            nt = token_counter((prompt + input_text), model)
            print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
            print("current iteration ", iteration)
            print("try other ", max_retries, " times")
            print("sleeping", int(iteration * timeout_retry), "s")
            time.sleep(int(iteration * timeout_retry))
            iteration = iteration + 1
            print("\nprompt:")
            print(prompt)
            print("\ninput_text:")
            print(input_text)
            if max_retries in (45, 40, 35, 30, 25, 20, 15, 10, 5):
                # input_text = input_text[0:-1000]
                # input_text = input_text + delimiter
                #
                input_text = cleanInputText(input_text)
                #
                ntokens = int(token_counter(input_text, model))
                if ntokens > 1000:  # split the context if it is too big, bigger than 1000 tokens let's say
                    encod = encoding_getter(model)
                    text_splitter = TokenTextSplitter(
                        # separators=separators,
                        encoding_name=encod.name,
                        chunk_size=int(0.8 * ntokens),
                        chunk_overlap=50,
                        length_function=len,
                        add_start_index=True,
                    )
                    texts = text_splitter.create_documents([input_text])
                    input_text = texts[0].page_content
                    myMessages = []
                    myMessages.append({"role": "system", "content": prompt})
                    myMessages.append({'role': 'user', 'content': input_text})
        except Exception as err:
            response = None
            max_retries = max_retries - 1
            print(err)
            nt = token_counter((prompt + input_text), model)
            print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
            print("current iteration ", iteration)
            print("try other ", max_retries, " times")
            print("sleeping", int(iteration * timeout_retry), "s")
            time.sleep(int(iteration * timeout_retry))
            iteration = iteration + 1
            print("\nprompt:")
            print(prompt)
            print("\ninput_text:")
            print(input_text)
            if max_retries in (45, 40, 35, 30, 25, 20, 15, 10, 5):
                # input_text = input_text[0:-1000]
                # input_text = input_text + delimiter
                #
                input_text = cleanInputText(input_text)
                #
                ntokens = int(token_counter(input_text, model))
                if ntokens > 1000:  # split the context if it is too big, bigger than 1000 tokens let's say
                    encod = encoding_getter(model)
                    text_splitter = TokenTextSplitter(
                        # separators=separators,
                        encoding_name=encod.name,
                        chunk_size=int(0.8 * ntokens),
                        chunk_overlap=50,
                        length_function=len,
                        add_start_index=True,
                    )
                    texts = text_splitter.create_documents([input_text])
                    input_text = texts[0].page_content
                    myMessages = []
                    myMessages.append({"role": "system", "content": prompt})
                    myMessages.append({'role': 'user', 'content': input_text})


    if (response is None) and (max_retries <= 0):
        print("\n")
        print(prompt + input_text)
        print("\n")
        print("\nTried many times and did not succeed, there is something strange. Check the problem... exiting now\n")
        sys.exit()

    return response



def model_list_gptjrc():
    return openai.Model.list()


### DGLC API

def clean_gpt_out(output_text: str):

    if "From the text below, delimited by triple quotes, extract the following items: 1 - The name of the virus that has caused the outbreak" in output_text:
        print("debug")

    # cut the output at the first occurrence of any chat-template marker
    for marker in ["<|assistant|>", "<|prompt|>", "<|prompter|>", "<|answer|>", "<|im_end|>", "<|endofextract|>", "<br>",
                   "<|/assistant|>", "<|/prompt|>", "<|/prompter|>", "<|/answer|>", "<|/im_end|>", "<|/endofextract|>", "</br>",
                   "</|assistant|>", "</|prompt|>", "</|prompter|>", "</|answer|>", "</|im_end|>", "</|endofextract|>"]:
        if marker in output_text:
            output_text = output_text.split(marker)[0].strip()

    while "```" in output_text:
        output_text = output_text.replace("```", " ")

    while "  " in output_text:
        output_text = output_text.replace("  ", " ")

    return output_text
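
# Illustrative example of the cleaning performed by clean_gpt_out:
#   clean_gpt_out("42 cases<|im_end|><|assistant|>leftover text")  ->  "42 cases"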


# def setup_dglc(key=None):
#     if key is None:
#         ACCESS_KEY = os.getenv("DGL_TOKEN")
#     else:
#         ACCESS_KEY = key
#
#     client_dglc.login(ACCESS_KEY)
#
#     # list available models
#     models_available = model_list_dglc()
#     print("DGLC - available models = " + str(models_available))
#
#     # chat_id = client_dglc.create_chat()
#     # # Or continue the previous one
#     # # chat_id = client_dglc.continue_chat(args.chat_id)
#     # print("\nCHAT_ID dglc", chat_id)


# def api_call_dglc(prompt: str, input_text: str, model: str, temperature: float, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [], debug=False, args: argparse.Namespace = None):
#
#     # if model == "gpt-3.5-turbo":
#     #     model = "OA_GPT3.5"
#
#     if not input_text or input_text.strip() == "" or not prompt or prompt.strip() == "":
#         logging.warning("No text or prompt supplied! Skipping it!")
#         return None
#
#     message = ""
#     if InContextExamples:
#         for row in InContextExamples:
#             message = message + prompt
#             for indCol, colVal in enumerate(row):
#                 if indCol == 0:
#                     if delimiter and len(delimiter) > 0:
#                         message = message + delimiter + colVal + delimiter
#                     else:
#                         message = message + colVal
#                 elif indCol == 1:
#                     message = message + " \n" + colVal + " \n"
#
#     if delimiter and len(delimiter) > 0:
#         message = prompt + delimiter + input_text + delimiter
#     else:
#         message = prompt + "\n" + input_text
#
#     if debug:
#         print("\n")
#         print(message)
#
#     chat_id = client_dglc.create_chat()
#     # Or continue the previous one
#     # chat_id = client.continue_chat(args.chat_id)
#     if debug:
#         print("\nCHAT_ID dglc", chat_id)
#
#     try:
#         events = client_dglc.send_message(message, model, temp=temperature)
#     except Exception as err:
#         logging.error(f'FAILED api_call_dglc WITH MESSAGE: \'{message}\' \nMODEL: {model}; \n\tError: {err}')
#
#     events = None
#     max_retries = 50
#     iteration = 1
#     while events is None and max_retries > 0:
#         try:
#             events = client_dglc.send_message(message, model, temp=temperature)
#         except Exception as err:
#             events = None
#             max_retries = max_retries - 1
#             print(err)
#             nt = token_counter((message), model)
#             print("Model " + str(model) + " - Length of overall prompt message ", str(nt))
#             print("current iteration ", iteration)
#             print("try other ", max_retries, " times")
#             print("sleeping", int(iteration * timeout_retry), "s")
#             time.sleep(int(iteration * timeout_retry))
#             iteration = iteration + 1
#
#     if (events is None) and (max_retries <= 0):
#         print("\n")
#         print(message)
#         print("\n")
#         print("\nTried many times and did not succeed, there is something strange. Check the problem... exiting now\n")
#         sys.exit()
#
#     if events:
#         event = [str(x) for x in events]
#         # The message is streamed token by token
#         # for event in events:
#         #     print(event, end="", flush=True)
#         if event:
#             event = event[-1]
#         else:
#             event = None
#
#         if debug:
#             print("\nAPI CALL ANSWER:")
#             print(event)
#             print("\n")
#
#     else:
#         event = None
#
#     return event

# def model_list_dglc():
#     return client_dglc.get_available_models()


### CALLING MODELS


def call_model_with_caching(input_text: str, prompt: str, model: str, temperature: int, handler,
                            map_query_input_output: dict, cache_fp: str, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [], verbose: bool = True, args: argparse.Namespace = None):
    """ call the LLM API (via the given handler) and take care of caching the results
    input_text: input text
    prompt: prompt
    model: model name (as parameter of the query)
    temperature: temperature (0: precise, 1: creative)
    handler: delegate function that will make the call (not necessarily only OpenAI, could be any one)
    map_query_input_output: cache dict containing already processed data
    cache_fp: file to which the content of the cache is written after each call
    """

    if not input_text or input_text.strip() == "" or not prompt or prompt.strip() == "":
        logging.warning("No text or prompt supplied! Skipping it!")
        return None

    # try to read the cache

    if map_query_input_output is not None:
        key = model + "__" + str(temperature) + "__" + prompt

        if key in map_query_input_output:
            if input_text in map_query_input_output[key]:
                output = map_query_input_output[key][input_text]
                # if input_text.strip() == "":
                #     print("here")

                # if handler == api_call_dglc:
                #     output = clean_gpt_out(output)  # clean output

                if verbose:
                    print("RETRIEVED CACHED RESULT FOR:\n", prompt, "\n", delimiter, input_text, delimiter, "\n=>\n", output, "\n")

                return output

    # call

    response = None

    try:
        response = handler(prompt, input_text, model, temperature, timeout_retry, delimiter, InContextExamples, args=args)
    except Exception as err:
        logging.error(f'FAILED WITH PROMPT: \'{prompt}\' \nLEN_TEXT: {len(input_text)}, \nTEXT: {(input_text)}, \nMODEL: {model}; \n\tError: {err}')
    #else:
    #    # logging.warning(f'INDEX: \'{SOURCE_INDEX}\' Inserted {inserted} rows out of {num_lines} rows [{round((inserted/num_lines)*100, 2)}%]')
    #    break

    if response:
        if isinstance(response, str):
            output_text = response
        else:
            #output_text = response['choices'][0]['message']['content']
            output_text = response.choices[0].message.content

        # if handler == api_call_dglc:
        #     output_text = clean_gpt_out(output_text)  # clean output


        # write to the cache

        if map_query_input_output is not None:
            if key not in map_query_input_output:
                map_query_input_output[key] = {}

            if output_text:
                if output_text != "":
                    map_query_input_output[key][input_text] = output_text

                with open(cache_fp, "w") as f:
                    json.dump(map_query_input_output, f)

        if verbose:
            print("API CALL REPLY FOR:\n", prompt, "\n", delimiter, input_text, delimiter, "\n=>\n", output_text, "\n")

        return output_text

    else:
        return None
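
# Sketch of the on-disk cache layout produced by call_model_with_caching: a JSON object
# keyed by "<model>__<temperature>__<prompt>", each mapping input texts to replies, e.g.
#   {"gpt-35-turbo-16k__0.01__translate ...": {"hello world": "hola mundo"}}
# (the translation value here is illustrative, not a recorded model output)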


def call_model(input_text: str, prompt: str, model: str, temperature: int, handler, timeout_retry: int = 5, delimiter: str = "```", InContextExamples: list[list[str]] = [],
               verbose: bool = True, args: argparse.Namespace = None):
    """ call the LLM API without caching the results
    input_text: input text
    prompt: prompt
    model: model name (as parameter of the query)
    temperature: temperature (0: precise, 1: creative)
    handler: delegate function that will make the call (not necessarily only OpenAI, could be any one)
    """

    if not input_text or input_text.strip() == "" or not prompt or prompt.strip() == "":
        logging.warning("No text or prompt supplied! Skipping it!")
        return None

    return call_model_with_caching(input_text, prompt, model, temperature, handler, None, None, timeout_retry, delimiter, InContextExamples, verbose, args=args)



def process_list(list_input_text: list[str], prompt: str, service_provider: str, model: str, temperature: int,
                 cache_prefix_fp: str = None, delimiter: str = "```", InContextExamples: list[list[str]] = [], args: argparse.Namespace = None):
    """ process a list of texts with a prompt and a model
    list_input_text: list of input texts
    prompt: prompt
    service_provider: either "openai" or "gptjrc" for the moment
    model: model name (as parameter of the query)
    temperature: temperature (0: precise, 1: creative)
    cache_prefix_fp: prefix of the file to which the content of the cache is written after each call
    """

    if cache_prefix_fp is not None:
        cache_fp = cache_prefix_fp + "___" + "__".join([service_provider, model, str(temperature)]).replace(" ", "_") + ".json"

        if os.path.exists(cache_fp):
            with open(cache_fp) as f:
                map_query_input_output = json.load(f)
        else:
            map_query_input_output = {}
    else:
        map_query_input_output = None
        cache_fp = None

    handler = None
    #if service_provider.lower() == "dglc": handler = api_call_dglc
    if service_provider.lower() == "openai": handler = api_call_openai
    if service_provider.lower() == "gptjrc": handler = api_call_gptjrc
    #if service_provider.lower() == "hfonpremises": handler = api_call_HFonPremises

    list_output_text = []
    for input_text in tqdm(list_input_text):
        output_text = call_model_with_caching(input_text, prompt, model, temperature, handler, map_query_input_output,
                                              cache_fp, delimiter=delimiter, InContextExamples=InContextExamples, args=args)
        list_output_text.append(output_text)

    return list_output_text
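
# Illustrative example of the cache file naming scheme implemented above:
#   process_list(texts, prompt, "gptjrc", "gpt-35-turbo-16k", 0, cache_prefix_fp="RUN1")
#   would read/write the cache file "RUN1___gptjrc__gpt-35-turbo-16k__0.json".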




if __name__ == "__main__":

    USE_CACHE = False  #True #False

    #service_provider = "openai"
    #model_name = "gpt-3.5-turbo-16k"
    #
    #
    #service_provider = "dglc"
    # dglc available models: 'OA_SFT_Pythia_12B', 'JRC_RHLF_13B', 'OA_GPT3.5', 'OA_GPT3'
    # model_name = "gpt-3.5-turbo"  #OpenAI name
    # model_name = 'JRC_RHLF_13B'
    #model_name = "OA_SFT_Pythia_12B"  #EleutherAI-pythia-12b
    # model_name = "OA_GPT3"
    # model_name = "GPT@JRC_4"
    #
    #
    #service_provider = "gptjrc"
    #model_name = "gpt-35-turbo-0613"
    #model_name = "gpt-35-turbo-16k"
    #model_name = "gpt-4-32k"  #GPT-4 with a context length of 32,768 tokens - around 116000
    service_provider = "HFonPremises"

    #model_name = "nous-hermes-2-mixtral-8x7b-dpo"
    #model_name = "llama-3.1-8b-instruct"
    model_name = "llama-3.1-70b-instruct"

    # temperature: temperature_value (0: precise, 1: creative)
    temperature_value = 0.01  # 0.1

    ##################################################################################################

    #OpenAI ChatGPT API
    if service_provider == "openai":
        MyOpenAPIKey = ""
        fkeyname = "OpenAI-DigLifeAccount-APItoken.key"
        if os.path.exists(fkeyname):
            with open(fkeyname) as f:
                MyOpenAPIKey = f.read()
        else:
            MyOpenAPIKey = os.environ['key_MyOpenAPI']
        setup_openai(key=MyOpenAPIKey)

        # # test api call
        # r = api_call_openai("say hello world", "you will answer in Spanish", "gpt-3.5-turbo", 0)
        # print(r)

        # # test process list
        # r = process_list(["hello world", "hello everybody"], "you will translate to Spanish", "openai", "gpt-3.5-turbo", 0)
        # print(r)

        # # process list with caching
        # r = process_list(["hello world", "hello everybody"], "you will translate to Spanish", "openai", "gpt-3.5-turbo", 0, "UNITTEST")
        # print(r)

    #### GPT@JRC API
    if service_provider == "gptjrc":
        key_gptjrc = ""
        fkeyname = "GPTJRC-APItoken.key"
        if os.path.exists(fkeyname):
            with open(fkeyname) as f:
                key_gptjrc = f.read()
        else:
            key_gptjrc = os.environ['key_gptjrc']

        os.environ['OPENAI_API_KEY'] = key_gptjrc
        #setup_gptjrc(key_gptjrc)
        setup_gptjrc()


    #### DGLC API
    if service_provider == "dglc":
        key_dglc = ""
        fkeyname = "DGLC-APItoken.key"
        if os.path.exists(fkeyname):
            with open(fkeyname) as f:
                key_dglc = f.read()
        else:
            key_dglc = os.environ['key_dglc']
        # setup_dglc(key=key_dglc)

    # TEST OF DGLC API CALL
    # input_text = "this morning a rabbit killed a hunter"
    # print("\ntext = \n" + input_text)
    # prompt = "please reformulate the text, add more details, the text should be between 200 and 500 characters:"
    # print("\nquestion = \n" + prompt)
    # model = "OA_SFT_Pythia_12B"  #not available in OpenAI
    # print("\nmodel = \n" + model)
    # print("\n")
    #
    # r = api_call_dglc(prompt, input_text, model, 0.5)
    # if r:
    #     print(r)
    #
    # print("\nDone!\n")


    ###########################################################################


    if USE_CACHE:
        # cache_prefix_fp: prefix of the file to which the content of the cache is written after each call
        cache_prefix_fp = "LLMQUERYTEST"
        cache_name = cache_prefix_fp + "___" + "__".join([service_provider, model_name, str(temperature_value)]).replace(" ", "_") + ".json"

        if os.path.exists(cache_name):
            with open(cache_name) as f:
                load_map_query_input_output = json.load(f)
        else:
            load_map_query_input_output = {}

    myPrompt = f"""
    translate into Spanish the text below, delimited by triple backticks.
    Text:
    """

    myDelimiter = "```"

    ###

    #example for counting the number of tokens of the overall prompt for the model

    # input_text = "one, two, three, a step fortward Mary"
    # overall_string = myPrompt + myDelimiter + input_text + myDelimiter
    # nt = token_counter(overall_string, model_name)
    # print("\nNumber of Tokens in the example = " + str(nt))

    encod = encoding_getter(model_name)
    print("\nencodName = " + str(encod.name))

    InContextExamples = []
# InContextExamples = [["ADENOVIRUS - USA (02): (NEW JERSEY) UPDATE A ProMED-mail post http://www.promedmail.org ProMED-mail is a program of the International Society for Infectious Diseases http://www.isid.org Date: Sun 28 Oct 2018 3:12 PM Source: CBS news [edited] https://www.cbsnews.com/news/adenovirus-outbreak-new-jersey-wanaque-center- facility-9th-child-dies-2018-10-28/ Another child has died amid a deadly virus outbreak at a New Jersey rehabilitation, bringing the total number of deaths to 9, officials said [Sun 28 Oct 2018]. The state\'s Department of Health said the victim was a \"medically fragile\" child who had a confirmed case of adenovirus. The unidentified child died Saturday evening [27 Oct 2018] at the Wanaque Center for Nursing and Rehabilitation in Haskell, the department said. There have been 25 cases associated with the outbreak. \"This is a tragic situation, and our thoughts are with the families who are grieving right now,\" Health Commissioner Dr Shereef Elnahal said in a statement. \"We are working every day to ensure all infection control protocols are continuously followed and closely monitoring the situation at the facility.\" Adenoviruses are a family of viruses that account for about 5 to 10 percent of fevers in young children, but most patients recover. The infections can affect the tissue linings of the respiratory tract, eyes, intestines, urinary tract and nervous system, causing illnesses ranging from a cold to bronchitis to pneumonia to pink eye. The children at Wanaque appear to have been more susceptible to serious infections due to their other medical conditions. Children at the center are severely disabled, with some living in comas, and for many, it is their permanent home, the Bergen Record reports. Many will never walk or talk, and some have spent virtually their whole lives there, according to the paper. . Communicated by: ProMED-mail Rapporteur Kunihiko Iizuka [Human adenoviruses (HAdVs) are non-enveloped, linear double-stranded DNA viruses encapsidated within a protein shell and have been categorized into 6 species (A-F) that contain 51 immunologically distinct serotypes (Fields virology. 5th ed. Philadelphia (PA): Lippincott-Raven; 2007. p. 2395-436). HAdVs most commonly cause acute respiratory disease; however, depending on the infecting HAdV serotype and tropism resulting from differential host receptor use, the wide variety of symptoms can include pneumonia, febrile upper respiratory illness, conjunctivitis, cystitis, and gastroenteritis (Principles and practice of infectious diseases. 5th ed. Vol 2. Philadelphia (PA): Churchill Livingstone; 2005. p. 1835-41). The severity of disease appears dependent on the immunocompetence and cardiopulmonary health of the host, and the spectrum of disease can range from subclinical to severe respiratory distress and death. Immunocompromised patients (such as bone marrow transplant recipients) are particularly susceptible to HAdV infection, resulting in severe illness and deaths, whereas illness in immunocompetent patients generally resolves without major complication. The outbreak report above involves young children that are in a healthcare facility and immunocompromised on account of underlying co- morbid conditions. Adenovirus associated morbidity and mortality in this setting would require intensive infection control measures. In immunocompromised patients, several drugs, such as cidofovir, ribavirin, ganciclovir, and vidarabine, have been used to treat adenovirus infections. 
Most of these agents are virostatic, may induce drug resistance, and have significant risks of toxicities, as well as risks to healthcare staff [e.g., aerosolized ribavirin - Mod.ML]. - Mod.UBA HealthMap/ProMED map available at: New Jersey, United States: https://promedmail.org/promed-post?place=6117463,232 ] See Also Adenovirus - USA: (NJ) children, fatal 20181025.6108470 .uba/mj/ml",
# "{\"virus\": \"ADENOVIRUS\", \"country\": \"USA\", \"date\": \"2018-10-28\", \"cases\": \"25\", \"deaths\": \"9\"}"],
# ["NIPAH VIRUS - INDIA (14): (KERALA) * A ProMED-mail post http://www.promedmail.org ProMED-mail is a program of the International Society for Infectious Diseases http://www.isid.org Date: Tue 3 Jul 2018 Source: MediBulletin [edited] https://medibulletin.com/2018/07/03/bats-indicted-in-kerala-nipah-outbreak- icmr-sends-paper-to-lancet/ Putting to rest suspense about the source of the Nipah virus infections in Kerala, scientists from the Indian Council of Medical Research have now found the virus in bats that were caught from the affected areas. At least 17 people died of Nipah infection in Mallapuram and Kozhikode districts of Kerala over April and May [2018]. While the 1st batch of bats caught from the well in Kozhikode in the house from where the 1st case was reported, had tested negative; of the 2nd batch of 52 fruit bats, 19.2 percent were found to carry the virus. The findings will be published in The Lancet. Health minister J P Nadda was informed about the findings in a meeting last week. In the meeting, scientists from ICMR and public health officials also told the minister that circumstances have now improved enough for the state to be declared Nipah free. The incubation period of Nipah is 5 to 14 days. The last case was in May [2018] and now that 2 incubation periods have elapsed without any fresh cases, the specter of the dreaded disease seems to be finally receding. . Communicated by: ProMED-mail <[email protected]> [It is good to learn that there have been no additional cases of Nipah virus infection in Kerala. As was mentioned earlier, it is not surprising that the bats taken from the well were negative for the virus. Giant fruit bats (flying foxes genus _Pteropus_), the reservoir of Nipah virus in Bangladesh and Malaysia, do not roost in wells. They roost in tree tops. The species sampled in the 2nd batch of bats was not mentioned, but were likely flying foxes. It is fortunate that virus positive bats were found in this 2nd sampling. As commented earlier, bats \"may only be infectious for a week or 2, and then they clear the virus and they\'re no longer infectious,\" said Jonathan Epstein, a veterinarian and epidemiologist at EcoHealth Alliance, New York, who has, for over a decade, studied Nipah outbreaks and the bats that cause them, in Malaysia, India and Bangladesh. \"That\'s why these outbreaks are relatively rare events, given the fact that these bats are so abundant and so common but very few of them are ever actually shedding virus at a given time.\" Epstein and others conducted an experimental study of _Pteropus_ bats in 2011 and found that the time window in which the bats are capable of passing on the infection to other animals or humans is quite small. In fact, the virus can\'t be found in experimentally infected bats after a few weeks. The few bats in an infected population that could be shedding the virus may be doing so in low quantities and for a short duration. \"Finding that bats don\'t have Nipah virus at the time of sampling certainly doesn\'t mean that it didn\'t come from those bats, particularly _P. medius_,\" Epstein said. 
\"The overwhelming abundance of evidence really shows that this bat is the reservoir for Nipah virus on the subcontinent in Bangladesh and in India.\" - Mod.TY Maps of India can be accessed at: http://www.mapsofindia.com/maps/india/india-political-map.htm HealthMap/ProMED map available at: Kerala State, India: https://promedmail.org/promed-post?place=5887151,308 ] See Also Nipah virus - India (13): (KL) 20180611.5851326 Nipah virus - India (12) 20180603.5836554 Nipah virus - India (11): (KL) 20180602.5835342 Nipah virus - India (10): (KL) 20180602.5833137 Nipah virus - India (09): (WB ex KL) susp. 20180530.5829184 Nipah virus - India (08): (KR ex KL) susp. 20180529.5826769 Nipah virus - India (07) 20180528.5822566 Nipah virus - India (06): (KL,KA) 20180526.5819777 Nipah virus - India (05): (KL,TG) 20180525.5817917 Nipah virus - India (04): (KL, KA) 20180524.5815473 Nipah virus - India (03): (KL) conf. 20180522.5812214 Nipah virus - India (02): (KL) conf 20180521.5809003 Nipah virus - India: (KL) susp 20180521.5807513 2007 . Nipah virus, fatal - India (West Bengal) (02) 20070511.1514 Nipah virus, fatal - India (West Bengal) 20070508.1484 Undiagnosed deaths - Bangladesh, India (04) 20070504.1451 .ty/ao/jh",
# "{\"virus\": \"Nipah virus\", \"country\": \"India\", \"date\": \"2018-07-03\", \"cases\": \"None\", \"deaths\": \"17\"}"],
# ["UNDIAGNOSED RESPIRATORY ILLNESS - USA: (NEW YORK), ex MIDDLE EAST, FLIGHT PASSENGERS AND CREW, REQUEST FOR INFORMATION * A ProMED-mail post http://www.promedmail.org ProMED-mail is a program of the International Society for Infectious Diseases http://www.isid.org [1] Date: Wed 5 Sep 2018 Source: Stat News [edited] https://www.statnews.com/2018/09/05/plane-quarantined-at-kennedy-airport-amid- reports-of-100-ill-passengers/ An Emirates Airline flight was held in quarantine for a period at New York\'s John F. Kennedy International Airport after a large number of passengers were reported feeling ill during the flight. The airline said in a statement that about 10 passengers on the flight from Dubai to New York had become ill on board. But the Centers for Disease Control and Prevention [CDC] said the estimated number was about 100. Passengers and some crew complained of illness including cough, according to the CDC; some had developed a fever. \"CDC public health officers are working with port authority, EMS, and CBP officials to evaluate passengers including taking temperatures and making arrangements for transport to local hospitals,\" the CDC said. \"Passengers who are not ill will be allowed to continue with their travel plans, and if necessary will be followed up with by health officials.\" A spokesman for New York Mayor Bill de Blasio said 19 passengers had taken ill - 10 were sent to a hospital and another 9 refused medical attention. There were approximately 521 passengers on the flight. A number of the passengers on the flight were returning from the Hajj, the annual mass pilgrimage to Mecca, in Saudi Arabia, a source told STAT. It was unclear if the people who were ill were the same passengers who had attended. Saudi Arabia has reported cases of MERS, Middle East respiratory syndrome, which passes to people from camels. But the fact that so many people became ill during the flight would make MERS seem an unlikely cause. [Byline: Helen Branswell] . Communicated by: Meghan May University of New England <[email protected]> [2] Date: Wed 5 Sep 2018 Source: BBC [edited] https://www.bbc.com/news/world-us-canada-45425412 A total of 19 people have been taken ill after an Emirates airline plane landed in New York, officials say. The plane was quarantined at JFK airport as those on board were checked by health officials. As many as 10 were taken to hospital but others refused treatment. The US Centers for Disease Control and Prevention (CDC) said that initially about 100 people including some crew had complained of illness. Flight 203 from Dubai landed at 09:10 (13.10 GMT) with 521 passengers. Emergency vehicles were seen on the runway as it landed. Soon afterwards, Emirates airline tweeted that the sick passengers were being attended to and those who were unaffected would be allowed to leave the plane. The CDC said in a statement that is was \"aware of an Emirates flight from Dubai that arrived this morning at JFK\". \"Approximately 100 passengers, including some crew on the flight, complained of illness including cough and some with fever. \"CDC public health officers are working with. officials to evaluate passengers including taking temperatures and making arrangements for transport to local hospitals those that need care.\" Later Eric Phillips, spokesman for New York Mayor Bill de Blasio, confirmed that all the passengers were off the plane and the sick people had been taken to hospital. 
He said that some of the passengers had originally come from the Saudi Arabian city of Mecca, which was currently experiencing a flu outbreak, and that the passengers\' symptoms were \"pointing to the flu\". . Communicated by: ProMED-mail <[email protected]> [3] Date: Wed 5 Sep 2018, 10:55 AM ET Source: NPR [edited] https://www.npr.org/2018/09/05/644826743/emirates-airline-says- about-10-passengers-fell-ill-on-flight-to-new-york Health and safety officials are investigating an illness that struck people on an Emirates Airline flight from Dubai, United Arab Emirates, to New York\'s John F. Kennedy International Airport on Wednesday morning. A total of 7 crew members and 3 passengers were taken to the hospital, Emirates Airline said. It added that [Wed 5 Sep 2018] return flight from New York to Dubai would leave 3 hours late. The Centers for Disease Control and Prevention said around 100 people on the overnight Flight 203 had complained of illness. For some, the symptoms were limited to a cough; others had a fever. An Emirates A380 in quarantine at JFK Airport right now awaiting CDC officials after about 100 passengers became ill with fevers over 100 degrees and coughing. Flight 203 had just arrived from Dubai. Ten people were taken off the plane for treatment at Jamaica Hospital Medical Center, said Eric Phillips, press secretary for New York City Mayor Bill de Blasio. He said 9 others were found to be sick but refused medical attention. The aircraft was carrying around 521 passengers. Health officials allowed people to disembark only after checking each one for symptoms, Phillips stated, adding, \"The plane\'s been quarantined and the CDC is on the scene.\" As for what the sickness might be, Phillips referred to a \"flu outbreak\" in Mecca, Saudi Arabia, and said that might be a possibility, stating, \"It appears some of the ill passengers came from Mecca before getting on in Dubai.\" [One] passenger called it the \"worst flight ever,\" saying on Twitter that the plane \"was basically a flying infirmary. Many of these people should never have been allowed to board.\" By around noon, 432 passengers had been cleared and allowed to go to the customs area, according to Phillips. A few others who showed symptoms were held for treatment and possible transport to the hospital. NPR\'s Rob Stein reports that the Centers for Disease Control and Prevention is working with local authorities. The Airbus A380 jet was isolated on the tarmac at JFK, as officials took stock of the situation. Images from the scene showed a row of ambulances alongside the aircraft. [Another] passenger said via Twitter that CDC staff came onto the plane and that everyone aboard was asked to fill out a form providing their contact information for the next 3 weeks. [Byline: Bill Chappell] . Communicated by: ProMED-mail <[email protected]> [Mention a plane load of individuals with febrile respiratory symptoms coming from the Middle East and immediate panic of \"could this be MERS-CoV?\" Equally or more serious than this is an outbreak of influenza. According to the media coverage, there is an ongoing outbreak of influenza in Mecca (Makkah) (Saudi Arabia, where Hajji\'s congregate) now, concurrent with the Hajj, and a rapid onset of respiratory symptoms is more likely to be influenza than it is MERS- CoV. We still do not have information as to the origins of the ill passengers to know if they were beginning their trips in Mecca and were returning Hajji\'s or if they began their travels elsewhere. 
Presumably the crew members began their working journeys in Dubai, but may have commuted in from elsewhere in or outside of the region (and there is mention of ill crew members). Returning to the possibility that this is related to the Hajj, a mass gathering of more than 2 million individuals from all over the world, making a religious pilgrimage to Mecca, it is interesting to review the Saudi Hajj/Umrah health requirements. While some vaccines are mandatory (meningitis vaccine, polio if coming from a country with ongoing poliovirus transmission, and yellow fever if coming from a known yellow fever endemic area, in contrast, influenza vaccine is recommended but not obligatory. \"Seasonal Influenza: \"The Saudi Ministry of Health recommends that international pilgrims be vaccinated against seasonal influenza before arrival into the Kingdom of Saudi Arabia, particularly those at increased risk of severe influenza diseases, including pregnant women, children under 5 years, the elderly, and individuals with underlying health conditions such as HIV/AIDS, asthma, and chronic heart or lung diseases. In Saudi Arabia, seasonal influenza vaccination is recommended for internal pilgrims, particularly those at risk described above, and all health-care workers in the Hajj premises.\" https://www.saudiembassy.net/hajj-and-umrah-health-requirements Given concerns re: possible MERS-CoV transmission to pilgrims visiting Saudi Arabia, many countries have increased surveillance of respiratory illnesses in returning pilgrims (notably post Hajj), and in the 6 years since identification of the MERS-CoV, there have been no cases among returning Hajji\'s and just a handful of cases among individuals returning from having performed the Umrah pilgrimage. Movement and exposures of visiting Hajj pilgrims are controlled - camels are not permitted in the area where Hajjis are congregated, healthcare workers and others identified as contacts of confirmed MERS-CoV cases are not permitted to go to the Hajj area Studies addressing the etiologies of respiratory illnesses in returning Hajji\'s have identified influenza virus, respiratory syncytial virus, parainfluenza virus, adenovirus and rhinovirus (see Respiratory infections - UK: Hajj returnees 20151011.3706464 and Respiratory virus infections - Saudi Arabia: Hajj pilgrims 2012 20130730.1854631 ). Note that the incubation period for influenza ranges from 1-4 days so transmission during travel is a plausible event. The actual number of individual identified as ill enough to require medical treatment at a hospital seems to be between 19 and 27 (taking into account the additional 8 identified and mentioned by the New York City\'s mayor\'s spokesperson in a tweet). Presumably these individuals had a fever above 100 F (37.8 C) and cough. The extension of the count to approaching 100 individuals may include those with a cough, possibly in the early stages of illness (influenza frequently begins with a dry cough), and others possibly coughing in response to hearing others coughing around them (think theater or lectures when coughing begins.) As the rapid respiratory virus screening tests should be available in a reasonable amount of time, we will post information as it becomes available, but should any knowledgeable sources have additional information that can be shared with us, ProMED would be very grateful. 
The HealthMap/ProMED maps can be found at: New York State, United States: https://promedmail.org/promed-post?place=6009759,234 Middle East: https://promedmail.org/promed-post?place=6009759,12214 . Mod.MPP] See Also MERS-CoV (29): UK (England) ex Saudi Arabia, Risk Assessment ECDC 20180830.5996187 2017 . MERS-CoV (63): Saudi Arabia (QS, TB), WHO : 20171009.5369268 2016 . Health hazards - Saudi Arabia: Hajj, Umrah, vaccination requirements 20160715.4346367 MERS-CoV (71): Saudi Arabia (MK), pilgrimage caution, WHO 20160623.4305152 2015 . MERS-CoV (138): Saudi Arabia, Jordan, post Hajj surveillance, RFI 20151009.3704734 MERS-CoV (136): Kuwait WHO, Saudi Arabia MOH, camel, Hajj 20150924.3666811 MERS-CoV (131): Saudi Arabia, animal reservoir, camels, Hajj, RFI 20150914.3643612 MERS-CoV (130): Saudi Arabia, animal reservoir, camels, Hajj 20150912.3641457 MERS-CoV (114): Saudi Arabia, animal reservoir, camels, Hajj 20150823.3597358 Respiratory infections - UK: Hajj returnees 20151011.3706464 Respiratory infections - Canada: (AB) Hajj returnees 20151020.3729641 Influenza (51): Germany ex Saudi Arabia, Hajj returnee, RFI 20151009.3704297 Influenza (49): Canada ex Saudi Arabia, Hajj returnees, susp., RFI, Alert 20151005.3693052 2014 . Meningitis, meningococcal - Saudi Arabia: prevention, Hajj travel advice 20140815.2692227 2013 . Respiratory virus infections - Saudi Arabia: Hajj pilgrims 2012 20130730.1854631 2012 . Health hazards - Saudi Arabia: updated Hajj advice 20121011.1338172 2009 . Influenza pandemic (H1N1) 2009 (113): Saudi Arabia, Hajj fatalities 20091122.4013 Influenza pandemic (H1N1) 2009 (109): Saudi Arabia, Hajj pilgrims 20091120.3997 2006 . Influenza - Saudi Arabia: Hajj concerns 20061209.3478 .mpp/ao/mpp",
# "{\"virus\": \"None\", \"country\": \"United Arab Emirates\", \"date\": \"2018-09-05\", \"cases\": \"19\", \"deaths\": \"None\"}"]]
|
819 |
+
|
820 |
+
if InContextExamples:
|
821 |
+
ntotExamplesTokens = 0
|
822 |
+
for row in InContextExamples:
|
823 |
+
for col in row:
|
824 |
+
nt = token_counter(col, model_name)
|
825 |
+
#print("\nNumber of Tokens in the example = " + str(nt))
|
826 |
+
ntotExamplesTokens = ntotExamplesTokens + nt
|
827 |
+
#
|
828 |
+
print("\nNumber of Tokens of the all examples = " + str(ntotExamplesTokens))
|
829 |
+
|
830 |
+
###
|
831 |
+
|
832 |
+
if service_provider == "openai":
|
833 |
+
if USE_CACHE:
|
834 |
+
lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name,
|
835 |
+
temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_openai,
|
836 |
+
map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
|
837 |
+
else:
|
838 |
+
lambda_model = partial(call_model, prompt=myPromt, model=model_name,
|
839 |
+
temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_openai,
|
840 |
+
verbose=True)
|
841 |
+
elif service_provider == "gptjrc":
|
842 |
+
if USE_CACHE:
|
843 |
+
lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name,
|
844 |
+
temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_gptjrc,
|
845 |
+
map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
|
846 |
+
else:
|
847 |
+
lambda_model = partial(call_model, prompt=myPromt, model=model_name,
|
848 |
+
temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_gptjrc,
|
849 |
+
verbose=True)
|
850 |
+
# elif service_provider == "dglc":
|
851 |
+
# if USE_CACHE:
|
852 |
+
# lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name, temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples,
|
853 |
+
# handler=api_call_dglc, map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
|
854 |
+
# else:
|
855 |
+
# lambda_model = partial(call_model, prompt=myPromt, model=model_name,
|
856 |
+
# temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_dglc,
|
857 |
+
# verbose=True)
|
858 |
+
|
859 |
+
# elif service_provider == "HFonPremises":
|
860 |
+
# if USE_CACHE:
|
861 |
+
# lambda_model = partial(call_model_with_caching, prompt=myPromt, model=model_name,
|
862 |
+
# temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_HFonPremises,
|
863 |
+
# map_query_input_output=load_map_query_input_output, cache_fp=cache_name, verbose=True)
|
864 |
+
# else:
|
865 |
+
# lambda_model = partial(call_model, prompt=myPromt, model=model_name,
|
866 |
+
# temperature=temperature_value, delimiter=myDelimiter, InContextExamples=InContextExamples, handler=api_call_HFonPremises,
|
867 |
+
# verbose=True)
|
868 |
+
|
869 |
+
|
870 |
+
|
871 |
+
if lambda_model:
|
872 |
+
df = pd.DataFrame([["one, two, three, a step fortward Mary"], ["one, two, three, a step back"]], columns=["text"])
|
873 |
+
df["text_es"] = df["text"].apply(lambda_model)
|
874 |
+
|
875 |
+
print("\n")
|
876 |
+
print(df)
|
877 |
+
|
878 |
+
print("\nEnd Computations")
|
879 |
+
|
880 |
+
|
881 |
+
|
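The service-provider branching above always produces the same kind of object: functools.partial binds every argument of call_model / call_model_with_caching except the input text, so the result can be applied directly over a pandas column. A minimal, self-contained sketch of that wiring follows; the simplified call_model signature and fake_handler here are hypothetical stand-ins, not the repo's actual call_model / api_call_* implementations.

from functools import partial
import pandas as pd

def call_model(text, prompt, model, temperature, delimiter, InContextExamples, handler, verbose=False):
    # Delegate the actual request to whichever handler was bound in.
    return handler(prompt + delimiter + text + delimiter, model, temperature)

def fake_handler(payload, model, temperature):
    # Stand-in for a real API handler such as api_call_openai.
    return payload.upper()

lambda_model = partial(call_model, prompt="Extract entities:", model="demo-model",
                       temperature=0.0, delimiter="####", InContextExamples=None,
                       handler=fake_handler)

df = pd.DataFrame({"text": ["one, two, three"]})
df["text_es"] = df["text"].apply(lambda_model)  # only `text` is left unbound
print(df)

Because partial leaves only the first positional parameter unbound, df["text"].apply(lambda_model) invokes the model once per row with all other settings fixed.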
nerBio.py
ADDED
The diff for this file is too large to render. See raw diff.
retrieverRAG_testing.py
ADDED
@@ -0,0 +1,339 @@
+# https://www.mixedbread.ai/blog/mxbai-embed-large-v1
+# https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
+
+import os
+import time
+import pandas as pd
+import numpy as np
+from typing import Dict
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+from sentence_transformers.util import cos_sim
+from accelerate import Accelerator  # Import from accelerate
+from scipy.stats import zscore
+
+# Set up environment variables for Hugging Face caching
+os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+
+# Initialize the Accelerator
+accelerator = Accelerator()
+
+# Use the device managed by Accelerator
+device = accelerator.device
+print("Using accelerator device =", device)
+
+
+# 1. Load the model and tokenizer
+model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
+tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)
+modelRetriever = AutoModel.from_pretrained(model_id_Retriever)
+
+# Accelerate prepares the model (e.g., moves to the appropriate device)
+modelRetriever = accelerator.prepare(modelRetriever)
+
+
+# Define the transform_query function
+def transform_query(queryText: str) -> str:
+    """For retrieval, add the prompt for queryText (not for documents)."""
+    return f'Represent this sentence for searching relevant passages: {queryText}'
+
+# Define the pooling function
+def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
+    if strategy == 'cls':
+        outputs = outputs[:, 0]
+    elif strategy == 'mean':
+        outputs = torch.sum(
+            outputs * inputs["attention_mask"][:, :, None], dim=1
+        ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
+    else:
+        raise NotImplementedError
+    return outputs.detach().cpu().numpy()
+
+
+def retrievePassageSimilarities(queryText, passages):
+    # Create the docs list by adding the transformed queryText and then the passages
+    docs = [transform_query(queryText)] + passages
+
+    # 2. Encode the inputs
+    inputs = tokenizer_Retriever(docs, padding=True, return_tensors='pt')
+
+    # Move inputs to the right device using accelerator
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = modelRetriever(**inputs).last_hidden_state
+    embeddings = pooling(outputs, inputs, 'cls')
+
+    similarities = cos_sim(embeddings[0], embeddings[1:])
+
+    # print('similarities:', similarities)
+
+    return similarities
+
+
+def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
+
+    try:
+        similarities = retrievePassageSimilarities(queryText, passages)
+
+        # Create a DataFrame
+        df = pd.DataFrame({
+            'Passage': passages,
+            'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
+        })
+
+        # Filter the DataFrame based on the similarity threshold
+        df_filtered = df[df['Similarity'] >= min_threshold]
+
+        # If max_num_passages is specified, limit the number of passages returned
+        if max_num_passages is not None:
+            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+        # Return the filtered DataFrame
+        return df_filtered
+
+    except Exception as e:
+        # Log the exception message or handle it as needed
+        print(f"An error occurred: {e}")
+        return pd.DataFrame()  # Return an empty DataFrame in case of error
+
+
+def RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=None, min_threshold=0.5):
+    try:
+        # Encoding and similarity computation remains the same
+
+        similarities = retrievePassageSimilarities(queryText, passages)
+
+        # Calculate z-scores for similarities
+        z_scores = zscore(similarities.flatten())
+
+        # Create a DataFrame with passages, similarities, and z-scores
+        df = pd.DataFrame({
+            'Passage': passages,
+            'Similarity': similarities.flatten(),
+            'Z-Score': z_scores
+        })
+
+        # Filter passages based on z-score threshold
+        df_filtered = df[df['Z-Score'] >= z_threshold]
+
+        if min_threshold:
+            # Filter the retained passages also on the min similarity threshold
+            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]
+
+        # If max_num_passages is specified, limit the number of passages returned
+        if max_num_passages is not None:
+            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+        # Sort by similarity (or z-score if preferred)
+        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+        return df_filtered
+
+    except Exception as e:
+        # Log the exception message or handle it as needed
+        print(f"An error occurred: {e}")
+        return pd.DataFrame()  # Return an empty DataFrame in case of error
+
+
+def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
+    try:
+        # Encoding and similarity computation remains the same
+
+        similarities = retrievePassageSimilarities(queryText, passages)
+
+        # Determine threshold based on percentile
+        threshold = np.percentile(similarities.flatten(), percentile)
+
+        # Create a DataFrame
+        df = pd.DataFrame({
+            'Passage': passages,
+            'Similarity': similarities.flatten()
+        })
+
+        # Filter using percentile threshold
+        df_filtered = df[df['Similarity'] >= threshold]
+
+        if min_threshold:
+            # Filter the retained passages also on the min similarity threshold
+            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]
+
+        # If max_num_passages is specified, limit the number of passages returned
+        if max_num_passages is not None:
+            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+        # Sort by similarity
+        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+        return df_filtered
+
+    except Exception as e:
+        # Log the exception message or handle it as needed
+        print(f"An error occurred: {e}")
+        return pd.DataFrame()  # Return an empty DataFrame in case of error
+
+
+def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
+    try:
+        # Encoding and similarity computation (assuming retrievePassageSimilarities is defined elsewhere)
+        similarities = retrievePassageSimilarities(queryText, passages)
+
+        # Calculate the number of passages to select based on top fraction
+        num_passages_TopFraction = max(1, int(top_fraction * len(passages)))
+
+        # Create a DataFrame
+        df = pd.DataFrame({
+            'Passage': passages,
+            'Similarity': similarities.flatten()
+        })
+
+        # Select the top passages dynamically
+        df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')
+
+        if min_threshold:
+            # Filter the DataFrame also on min similarity threshold
+            df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]
+
+        # If max_num_passages is specified, limit the number of passages returned
+        if max_num_passages is not None:
+            df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+        # Sort by similarity
+        df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+        return df_filtered
+
+    except Exception as e:
+        # Log the exception message or handle it as needed
+        print(f"An error occurred: {e}")
+        return pd.DataFrame()  # Return an empty DataFrame in case of error
+
+
+if __name__ == '__main__':
+
+    queryText = 'A man is eating a piece of bread'
+
+    # Define the passages list
+    passages = [
+        "A man is eating food.",
+        "A man is eating pasta.",
+        "The girl is carrying a baby.",
+        "A man is riding a horse.",
+    ]
+
+    #df_retrieved = RAG_retrieval_Base(queryText, passages)
+    #df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5)
+    #df_retrieved = RAG_retrieval_Base(queryText, passages, max_num_passages=3)
+    df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0.5, max_num_passages=3)
+
+    #df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0)
+    #df_retrieved = RAG_retrieval_Z_scores(queryText, passages, z_threshold=1.0, max_num_passages=3)
+
+    #df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80)
+    #df_retrieved = RAG_retrieval_Percentile(queryText, passages, percentile=80, max_num_passages=3)
+
+    #df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2)
+    #df_retrieved = RAG_retrieval_TopK(queryText, passages, top_fraction=0.2, max_num_passages=3)
+
+    print(df_retrieved)
+
+    #labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
+
+    print("end of computations")
+
+# VERSION WITHOUT ACCELERATE
+#
+# # https://www.mixedbread.ai/blog/mxbai-embed-large-v1
+# # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
+#
+# import os
+#
+# os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+# os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+# os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
+#
+# import time
+# import pandas as pd
+# import numpy as np
+#
+# from typing import Dict
+#
+# import torch
+# from transformers import AutoModel, AutoTokenizer
+# from sentence_transformers.util import cos_sim
+#
+# # For retrieval you need to pass this prompt. Please find out more in our blog post.
+# def transform_queryText(queryText: str) -> str:
+#     """For retrieval, add the prompt for queryText (not for documents)."""
+#     return f'Represent this sentence for searching relevant passages: {queryText}'
+#
+# # The model works really well with cls pooling (default) but also with mean pooling.
+# def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
+#     if strategy == 'cls':
+#         outputs = outputs[:, 0]
+#     elif strategy == 'mean':
+#         outputs = torch.sum(
+#             outputs * inputs["attention_mask"][:, :, None], dim=1) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
+#     else:
+#         raise NotImplementedError
+#     return outputs.detach().cpu().numpy()
+#
+# # 1. load model
+# model_id = 'mixedbread-ai/mxbai-embed-large-v1'
+# tokenizer = AutoTokenizer.from_pretrained(model_id)
+# model = AutoModel.from_pretrained(model_id).cuda()
+#
+# queryText = 'A man is eating a piece of bread'
+#
+# # Define the passages list
+# passages = [
+#     "A man is eating food.",
+#     "A man is eating pasta.",
+#     "The girl is carrying a baby.",
+#     "A man is riding a horse.",
+# ]
+#
+# # Create the docs list by adding the transformed queryText and then the passages
+# docs = [transform_queryText(queryText)] + passages
+#
+# # 2. encode
+# inputs = tokenizer(docs, padding=True, return_tensors='pt')
+# for k, v in inputs.items():
+#     inputs[k] = v.cuda()
+# outputs = model(**inputs).last_hidden_state
+# embeddings = pooling(outputs, inputs, 'cls')
+#
+# similarities = cos_sim(embeddings[0], embeddings[1:])
+#
+# print('similarities:', similarities)
+#
+# # Create a DataFrame
+# df = pd.DataFrame({
+#     'Passage': passages,
+#     'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
+# })
+#
+# # Display the DataFrame
+# print(df)
+#
+# print("end of computations")
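The four RAG_retrieval_* variants above differ only in how they turn raw cosine similarities into a keep/drop decision: an absolute threshold, a z-score cutoff, a percentile band, or a top fraction. A tiny sketch of the four decision rules on fixed, made-up similarity scores (no model needed):

import numpy as np
from scipy.stats import zscore

sims = np.array([0.82, 0.75, 0.40, 0.35])  # illustrative scores, not real model output

print(sims >= 0.5)                          # Base: absolute similarity threshold
print(zscore(sims) >= 1.0)                  # Z-scores: passages well above the mean
print(sims >= np.percentile(sims, 90))      # Percentile: only the top 10% band
print(np.argsort(sims)[::-1][:max(1, int(0.5 * len(sims)))])  # TopK: indices of the top fraction

The z-score and percentile rules adapt to the score distribution of each query, while the base threshold treats all queries alike; combining either with a min_threshold floor, as the functions above do, guards against queries where even the "best" passages are poor matches.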
virtuosoQueryRest.py
CHANGED
@@ -3,11 +3,14 @@ from requests.auth import HTTPDigestAuth, HTTPBasicAuth
 import ssl
 import json
 
+from joblib import Memory
 
+cachedir = 'cached'
+mem = Memory(cachedir, verbose=False)
 
 
 
-
+@mem.cache
 def execute_query(endpoint, query, auth):
     headers = {
         'Content-Type': 'application/x-www-form-urlencoded',
@@ -144,32 +147,26 @@ if __name__ == '__main__':
 
     #############
 
-    #query = 'SELECT * WHERE { ?s ?p ?o } LIMIT 100'
-
-    # word = "acute sinusitis"
-    # query = f"""
-    # SELECT ?concept ?label (COUNT(?edge) AS ?score)
-    # WHERE {{
-    # ?concept skos:prefLabel|rdfs:label|skos:altLabel|obo:hasRelatedSynonym ?label .
-    # FILTER (LCASE(STR(?label)) = "{word}")
-    # ?concept ?edge ?o .
-    # }}
-    # GROUP BY ?concept ?label
-    # ORDER BY DESC(?score)
-    # """
 
-    choices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
+    #choices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
+    choices = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
+               "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
+               "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
+               "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
+               "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"]
 
     # Construct the FROM clauses
     from_clauses = ' '.join([f"FROM <{choice}>" for choice in choices])
 
-    word = "acute sinusitis"
+    #word = "acute sinusitis"
+    word = "pure mathematics"
     # Construct the full SPARQL query
     query = f"""
+    prefix skosxl: <http://www.w3.org/2008/05/skos-xl#>
     SELECT ?concept ?label (COUNT(?edge) AS ?score)
     {from_clauses}
     WHERE {{
-    ?concept skos:prefLabel|rdfs:label|skos:altLabel|obo:hasRelatedSynonym ?label .
+    ?concept skos:prefLabel|rdfs:label|skos:altLabel|skosxl:literalForm|obo:hasRelatedSynonym ?label .
     FILTER (LCASE(STR(?label)) = "{word.lower()}")
     ?concept ?edge ?o .
     }}
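The joblib.Memory change at the top of this file memoizes execute_query on disk, so repeated identical SPARQL requests are served from the cached/ directory instead of hitting the Virtuoso endpoint again. A minimal sketch of the pattern; execute_query_demo is a hypothetical stand-in, since the real execute_query also takes an auth object, whose value must be hashable/picklable for joblib's argument hashing:

from joblib import Memory

mem = Memory('cached', verbose=False)  # results persisted under ./cached

@mem.cache
def execute_query_demo(endpoint, query):
    print("cache miss: querying endpoint...")  # printed only on the first call
    return {"endpoint": endpoint, "rows": []}  # stand-in for the parsed HTTP response

execute_query_demo("https://example.org/sparql", "SELECT 1")  # executes the body
execute_query_demo("https://example.org/sparql", "SELECT 1")  # returned from the disk cache

One consequence worth noting: the cache key is derived from the function's inputs, so any change to the endpoint, the query string, or the credentials produces a fresh remote call, while re-runs of the demo with the same word and graph choices are answered locally.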