thechaiexperiment committed on
Commit 12bd822 · 1 Parent(s): 22f5f6f

Update app.py

Files changed (1)
  1. app.py +17 -36
app.py CHANGED
@@ -189,10 +189,6 @@ def query_embeddings(query_embedding, n_results=5):
         print(f"Error in query_embeddings: {e}")
         return []
 
-query_embedding = embed_query_text(query_text)  # Embed the query text
-initial_results = query_embeddings(query_embedding, embeddings_data, n_results=5)
-document_ids = [doc_id for doc_id, _ in initial_results]
-
 def retrieve_document_text(doc_id):
     """Retrieve document text from HTML file"""
     try:
@@ -208,7 +204,6 @@ def retrieve_document_text(doc_id):
         print(f"Error retrieving document {doc_id}: {e}")
         return ""
 
-document_texts = retrieve_document_texts(document_ids, folder_path)
 
 def rerank_documents(query, doc_texts):
     """Rerank documents using cross-encoder"""
@@ -274,7 +269,6 @@ def extract_relevant_portions(document_texts, query, max_portions=3, portion_size=1, min_query_words=1):
 
     return relevant_portions
 
-relevant_portions = extract_relevant_portions(document_texts, query_text, max_portions=3, portion_size=1, min_query_words=1)
 
 def remove_duplicates(selected_parts):
     unique_sentences = set()
@@ -287,20 +281,6 @@ def remove_duplicates(selected_parts):
 
     return unique_selected_parts
 
-# Flatten the dictionary of relevant portions (from earlier code)
-flattened_relevant_portions = []
-for doc_id, portions in relevant_portions.items():
-    flattened_relevant_portions.extend(portions)
-
-# Remove duplicate portions
-unique_selected_parts = remove_duplicates(flattened_relevant_portions)
-
-# Combine the unique parts into a single string of context
-combined_parts = " ".join(unique_selected_parts)
-
-# Construct context as a list: first the query, then the unique selected portions
-context = [query_text] + unique_selected_parts
-
 def extract_entities(text):
     inputs = biobert_tokenizer(text, return_tensors="pt")
     outputs = biobert_model(**inputs)
@@ -372,11 +352,6 @@ def remove_incomplete_sentence(text):
         return text[:last_period_index + 1].strip()
     return text
 
-answer_part = answer.split("Answer:")[-1].strip()
-cleaned_answer = remove_answer_prefix(answer_part)
-final_answer = remove_incomplete_sentence(cleaned_answer)
-
-
 @app.get("/")
 async def root():
     return {"message": "Welcome to the FastAPI application! Use the /health endpoint to check health, and /api/query for processing queries."}
@@ -397,20 +372,26 @@ async def health_check():
 async def chat_endpoint(chat_query: ChatQuery):
     try:
         query_text = chat_query.query
-        query_embedding = models['embedding'].encode([query_text])
-        relevant_docs = query_embeddings(query_embedding)
-
-        doc_texts = [retrieve_document_text(doc_id) for doc_id, _ in relevant_docs]
-        doc_texts = [text for text in doc_texts if text.strip()]
-
-        rerank_scores = rerank_documents(query_text, doc_texts)
-        ranked_texts = [text for _, text in sorted(zip(rerank_scores, doc_texts), reverse=True)]
-
+        query_embedding = embed_query_text(query_text)
+        initial_results = query_embeddings(query_embedding, embeddings_data, n_results=5)
+        document_ids = [doc_id for doc_id, _ in initial_results]
+        document_texts = retrieve_document_texts(document_ids, folder_path)
+        flattened_relevant_portions = []
+        for doc_id, portions in relevant_portions.items():
+            flattened_relevant_portions.extend(portions)
+        unique_selected_parts = remove_duplicates(flattened_relevant_portions)
+        combined_parts = " ".join(unique_selected_parts)
         context = [query_text] + unique_selected_parts
-        answer = remove_incomplete_sentence(query_text, context)
+        entities = extract_entities(query_text)
+        passage = enhance_passage_with_entities(combined_parts, entities)
+        prompt = create_prompt(query_text, passage)
+        answer, generation_time = generate_answer(prompt)
+        answer_part = answer.split("Answer:")[-1].strip()
+        cleaned_answer = remove_answer_prefix(answer_part)
+        final_answer = remove_incomplete_sentence(cleaned_answer)
 
         return {
-            "response": answer,
+            "response": final_answer,
             "conversation_id": chat_query.conversation_id,
             "success": True
         }
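
A reader of the new chat_endpoint body may notice that relevant_portions is iterated before it is assigned anywhere inside the function, since the module-level assignment is removed earlier in this diff. A minimal sketch of the step that would define it, reusing the extract_relevant_portions call removed from module scope (the parameter values are carried over from that removed line, not re-verified against the rest of app.py):

# Hedged sketch: recompute relevant portions inside chat_endpoint before the
# flattening loop; this mirrors the call removed from module scope above.
relevant_portions = extract_relevant_portions(
    document_texts,
    query_text,
    max_portions=3,
    portion_size=1,
    min_query_words=1,
)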
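
The cross-encoder rerank step from the old endpoint body (rerank_documents plus the sorted zip) has no counterpart in the new body. If reranking is still wanted, a sketch of how it could sit between retrieval and portion extraction, adapted from the two removed lines:

# Hedged sketch: rerank the retrieved texts before extracting portions,
# mirroring the lines removed from the old endpoint body.
rerank_scores = rerank_documents(query_text, document_texts)
document_texts = [text for _, text in sorted(zip(rerank_scores, document_texts), reverse=True)]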
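
For exercising the endpoint, a hedged usage sketch in Python: it assumes chat_endpoint is mounted at POST /api/query (as the root message suggests), that the app runs on the default uvicorn port, and that ChatQuery carries the two fields echoed in the response payload; none of this is confirmed by the diff itself.

import requests

# Illustrative call only; URL, port, and field types are assumptions.
resp = requests.post(
    "http://localhost:8000/api/query",
    json={"query": "What are the symptoms of anemia?", "conversation_id": "demo-1"},
    timeout=60,
)
print(resp.json())  # expected keys: "response", "conversation_id", "success"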