ajalisatgi committed
Commit 4beb772 · verified · 1 Parent(s): 8dfd657

Update app.py

Files changed (1)
  1. app.py +56 -5
app.py CHANGED
@@ -8,38 +8,89 @@ import logging
 from datasets import load_dataset
 from nltk.tokenize import sent_tokenize
 import nltk
+from langchain.docstore.document import Document
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Initialize OpenAI API key
+openai.api_key = 'YOUR_OPENAI_API_KEY'  # Replace with your API key
+
+# Download NLTK data
+nltk.download('punkt')
+
 # Load the ragbench datasets
 ragbench = {}
 for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']:
     ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
     logger.info(f"Loaded {dataset}")
 
-# Initialize with a stronger model for better semantic understanding
+# Initialize with a stronger model
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
 embedding_model.client.to(device)
 
+# Chunking function
+def chunk_documents_semantic(documents, max_chunk_size=500):
+    chunks = []
+    for doc in documents:
+        if isinstance(doc, list):
+            for passage in doc:
+                sentences = sent_tokenize(passage)
+                current_chunk = ""
+                for sentence in sentences:
+                    if len(current_chunk) + len(sentence) <= max_chunk_size:
+                        current_chunk += sentence + " "
+                    else:
+                        chunks.append(current_chunk.strip())
+                        current_chunk = sentence + " "
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+        else:
+            sentences = sent_tokenize(doc)
+            current_chunk = ""
+            for sentence in sentences:
+                if len(current_chunk) + len(sentence) <= max_chunk_size:
+                    current_chunk += sentence + " "
+                else:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = sentence + " "
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+    return chunks
+
+# Process documents and create vectordb
+documents = []
+for dataset_name in ragbench.keys():
+    for split in ragbench[dataset_name].keys():
+        original_documents = ragbench[dataset_name][split]['documents']
+        chunked_documents = chunk_documents_semantic(original_documents)
+        documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+# Initialize vectordb with processed documents
+vectordb = Chroma.from_documents(
+    documents=documents,
+    embedding=embedding_model,
+    persist_directory='./docs/chroma/'
+)
+vectordb.persist()
+
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
 
-        # Get relevant documents specific to the chosen dataset
         relevant_docs = vectordb.max_marginal_relevance_search(
             query,
-            k=5,  # Top 5 most relevant documents
-            fetch_k=10  # Fetch top 10 then select most diverse 5
+            k=5,
+            fetch_k=10
         )
 
         context = " ".join([doc.page_content for doc in relevant_docs])
 
         response = openai.chat.completions.create(
-            model="gpt-4",
+            model="gpt-3.5-turbo",
             messages=[
                 {"role": "system", "content": "You are a specialized assistant for the RagBench dataset. Provide precise answers based solely on the given context."},
                 {"role": "user", "content": f"Dataset: {dataset_choice}\nContext: {context}\nQuestion: {query}\n\nProvide a detailed answer using only the information from the context above."}