Update app.py

app.py CHANGED
```diff
@@ -8,38 +8,89 @@ import logging
 from datasets import load_dataset
 from nltk.tokenize import sent_tokenize
 import nltk
+from langchain.docstore.document import Document
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Initialize OpenAI API key
+openai.api_key = 'sk-proj-...'  # Replace with your API key
+
+# Download NLTK data
+nltk.download('punkt')
+
 # Load the ragbench datasets
 ragbench = {}
 for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']:
     ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
     logger.info(f"Loaded {dataset}")
 
-# Initialize with a stronger model
+# Initialize with a stronger model
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
 embedding_model.client.to(device)
 
+# Chunking function
+def chunk_documents_semantic(documents, max_chunk_size=500):
+    chunks = []
+    for doc in documents:
+        if isinstance(doc, list):
+            for passage in doc:
+                sentences = sent_tokenize(passage)
+                current_chunk = ""
+                for sentence in sentences:
+                    if len(current_chunk) + len(sentence) <= max_chunk_size:
+                        current_chunk += sentence + " "
+                    else:
+                        chunks.append(current_chunk.strip())
+                        current_chunk = sentence + " "
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+        else:
+            sentences = sent_tokenize(doc)
+            current_chunk = ""
+            for sentence in sentences:
+                if len(current_chunk) + len(sentence) <= max_chunk_size:
+                    current_chunk += sentence + " "
+                else:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = sentence + " "
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+    return chunks
+
+# Process documents and create vectordb
+documents = []
+for dataset_name in ragbench.keys():
+    for split in ragbench[dataset_name].keys():
+        original_documents = ragbench[dataset_name][split]['documents']
+        chunked_documents = chunk_documents_semantic(original_documents)
+        documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+# Initialize vectordb with processed documents
+vectordb = Chroma.from_documents(
+    documents=documents,
+    embedding=embedding_model,
+    persist_directory='./docs/chroma/'
+)
+vectordb.persist()
+
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
 
-        # Get relevant documents specific to the chosen dataset
         relevant_docs = vectordb.max_marginal_relevance_search(
             query,
-            k=5,
-            fetch_k=10
+            k=5,
+            fetch_k=10
         )
 
         context = " ".join([doc.page_content for doc in relevant_docs])
 
         response = openai.chat.completions.create(
-            model="gpt-
+            model="gpt-3.5-turbo",
             messages=[
                 {"role": "system", "content": "You are a specialized assistant for the RagBench dataset. Provide precise answers based solely on the given context."},
                 {"role": "user", "content": f"Dataset: {dataset_choice}\nContext: {context}\nQuestion: {query}\n\nProvide a detailed answer using only the information from the context above."}
```