thechaiexperiment committed on
Commit 6e8f9d4 · verified · 1 Parent(s): 6e0f0b6

Update app.py

Files changed (1):
  1. app.py  +68 -55
app.py CHANGED
@@ -346,52 +346,45 @@ def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
         print(f"Error reranking documents: {e}")
         return []
 
-def extract_entities(text, ner_pipeline=None):
-    try:
-        if ner_pipeline is None:
-            ner_pipeline = models['ner_pipeline']
-        ner_results = ner_pipeline(text)
-        entities = {result['word'] for result in ner_results if result['entity'].startswith("B-")}
-        return list(entities)
-    except Exception as e:
-        print(f"Error extracting entities: {e}")
-        return []
-
-def match_entities(query_entities, sentence_entities):
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import nltk
+
+def extract_relevant_portions(query_embedding, top_documents, embeddings_data, max_portions=3):
     try:
-        query_set, sentence_set = set(query_entities), set(sentence_entities)
-        matches = query_set.intersection(sentence_set)
-        return len(matches)
+        relevant_portions = {}
+
+        for _, doc_id, doc_text in top_documents:
+            if doc_id not in embeddings_data:
+                print(f"Warning: No embedding available for Document ID {doc_id}. Skipping...")
+                continue
+
+            # Retrieve the precomputed embedding for this document
+            doc_embedding = np.array(embeddings_data[doc_id])
+
+            # Compute similarity between the query embedding and the document embedding
+            similarity = cosine_similarity(query_embedding, [doc_embedding]).flatten()[0]
+
+            # Split the document into sentences
+            sentences = nltk.sent_tokenize(doc_text)
+
+            # Rank sentences based on their length (proxy for importance) or other heuristic.
+            # Since we're using document-level embeddings, we assume all sentences are equally relevant.
+            sorted_sentences = sorted(sentences, key=lambda x: len(x), reverse=True)[:max_portions]
+
+            relevant_portions[doc_id] = sorted_sentences
+
+            print(f"Extracted relevant portions for Document ID {doc_id} (Similarity: {similarity:.4f}):")
+            for i, sentence in enumerate(sorted_sentences, start=1):
+                print(f"  Portion {i}: {sentence[:100]}...")  # Print first 100 characters for preview
+
+        return relevant_portions
+
     except Exception as e:
-        print(f"Error matching entities: {e}")
-        return 0
-
-def extract_relevant_portions(document_texts, query, max_portions=3, portion_size=1, min_query_words=2):
-    relevant_portions = {}
-    query_entities = extract_entities(query)
-    print(f"Extracted Query Entities: {query_entities}")
-    for doc_id, doc_text in enumerate(document_texts):
-        sentences = nltk.sent_tokenize(doc_text)
-        doc_relevant_portions = []
-        doc_entities = extract_entities(doc_text)
-        print(f"Document {doc_id} Entities: {doc_entities}")
-        for i, sentence in enumerate(sentences):
-            sentence_entities = extract_entities(sentence)
-            relevance_score = match_entities(query_entities, sentence_entities)
-            if relevance_score >= min_query_words:
-                start_idx = max(0, i - portion_size // 2)
-                end_idx = min(len(sentences), i + portion_size // 2 + 1)
-                portion = " ".join(sentences[start_idx:end_idx])
-                doc_relevant_portions.append(portion)
-            if len(doc_relevant_portions) >= max_portions:
-                break
-        if not doc_relevant_portions and len(doc_entities) > 0:
-            print(f"Fallback: Selecting sentences with most entities for Document {doc_id}")
-            sorted_sentences = sorted(sentences, key=lambda s: len(extract_entities(s, ner_biobert)), reverse=True)
-            for fallback_sentence in sorted_sentences[:max_portions]:
-                doc_relevant_portions.append(fallback_sentence)
-        relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
-    return relevant_portions
+        print(f"Error in extract_relevant_portions: {e}")
+        return {}
+
 
 def remove_duplicates(selected_parts):
     unique_sentences = set()
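For orientation, a minimal sketch of how the rewritten helper could be exercised on its own. The embedding dimension, document ID, and text below are made-up values; it assumes NLTK's punkt data is installed (nltk.sent_tokenize depends on it) and that query_embedding is a 2-D array, which is what scikit-learn's cosine_similarity expects:

# Illustrative only: toy inputs standing in for load_embeddings() and the reranked documents.
import numpy as np
import nltk
nltk.download('punkt')  # sentence splitting inside extract_relevant_portions relies on this

query_embedding = np.random.rand(1, 384)                  # shape (1, dim): cosine_similarity expects 2-D input
embeddings_data = {"doc1": np.random.rand(384).tolist()}  # assumed layout: dict keyed by document ID
top_documents = [(0.92, "doc1", "Short sentence. A considerably longer second sentence about anemia symptoms.")]

portions = extract_relevant_portions(query_embedding, top_documents, embeddings_data, max_portions=2)
print(portions)  # e.g. {"doc1": ["A considerably longer second sentence...", "Short sentence."]}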
@@ -426,11 +419,8 @@ def enhance_passage_with_entities(passage, entities):
 def create_prompt(question, passage):
     prompt = ("""
     As a medical expert, you are required to answer the following question based only on the provided passage. Do not include any information not present in the passage. Your response should directly reflect the content of the passage. Maintain accuracy and relevance to the provided information.
-
     Passage: {passage}
-
     Question: {question}
-
     Answer:
     """)
     return prompt.format(passage=passage, question=question)
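As a quick illustration of the tightened template (the passage and question below are placeholders, not taken from the app):

example = create_prompt(
    question="What are common symptoms of anemia?",        # placeholder
    passage="Anemia often causes fatigue and pale skin."    # placeholder
)
print(example)
# Passage, Question and Answer now sit on consecutive lines; the blank separator lines are removed by this commit.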
@@ -520,46 +510,69 @@ async def health_check():
 async def chat_endpoint(chat_query: ChatQuery):
     try:
         query_text = chat_query.query
-        language_code = chat_query.language_code
+        language_code = chat_query.language_code
+
+        # Translate Arabic to English if language_code is 0
         if language_code == 0:
             query_text = translate_ar_to_en(query_text)
+
+        # Generate query embedding
         query_embedding = embed_query_text(query_text)
         n_results = 5
-        embeddings_data = load_embeddings ()
+
+        # Load embeddings and retrieve initial results
+        embeddings_data = load_embeddings()
         folder_path = 'downloaded_articles/downloaded_articles'
         initial_results = query_embeddings(query_embedding, embeddings_data, n_results)
-        document_ids = [doc_id for doc_id, _ in initial_results]
+
+        # Extract document IDs and texts
+        document_ids = [doc_id for doc_id, *_ in initial_results]
         document_texts = retrieve_document_texts(document_ids, folder_path)
+
+        # Use cross-encoder to score documents
         cross_encoder = models['cross_encoder']
         scores = cross_encoder.predict([(query_text, doc) for doc in document_texts])
+
+        # Score and sort documents
         scored_documents = list(zip(scores, document_ids, document_texts))
         scored_documents.sort(key=lambda x: x[0], reverse=True)
-        relevant_portions = extract_relevant_portions(document_texts, query_text, max_portions=3, portion_size=1, min_query_words=2)
-        flattened_relevant_portions = []
-        for doc_id, portions in relevant_portions.items():
-            flattened_relevant_portions.extend(portions)
-        unique_selected_parts = remove_duplicates(flattened_relevant_portions)
+
+        # Extract relevant portions
+        relevant_portions = extract_relevant_portions(query_embedding, scored_documents, embeddings_data, max_portions=3)
+        unique_selected_parts = remove_duplicates(relevant_portions)
         combined_parts = " ".join(unique_selected_parts)
+
+        # Build context and enhance passage with entities
         context = [query_text] + unique_selected_parts
         entities = extract_entities(query_text)
         passage = enhance_passage_with_entities(combined_parts, entities)
+
+        # Create prompt and generate answer
         prompt = create_prompt(query_text, passage)
         answer = generate_answer(prompt)
        answer_part = answer.split("Answer:")[-1].strip()
+
+        # Clean and finalize the answer
         cleaned_answer = remove_answer_prefix(answer_part)
         final_answer = remove_incomplete_sentence(cleaned_answer)
+
+        # Translate English back to Arabic if needed
         if language_code == 0:
             final_answer = translate_en_to_ar(final_answer)
+
+        # Print and return the answer
         if final_answer:
             print("Answer:")
             print(final_answer)
         else:
             print("Sorry, I can't help with that.")
+
         return {
             "response": f"I hope this answers your question: {final_answer}",
-            # "conversation_id": chat_query.conversation_id,
+            # "conversation_id": chat_query.conversation_id,  # Uncomment if needed
             "success": True
         }
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
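One detail worth noting, shown here only as a sketch and not as part of the commit: extract_relevant_portions now returns a dict keyed by document ID, and remove_duplicates receives that dict directly. If remove_duplicates is meant to deduplicate sentences, as it did with the old flattened list, the values would still need to be flattened first, for example:

# Hypothetical flattening step, mirroring the loop this commit removed.
flattened = [portion for portions in relevant_portions.values() for portion in portions]
unique_selected_parts = remove_duplicates(flattened)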
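For manual testing, a sketch of how the endpoint might be called once the app is running. The route path and port are assumptions (they are outside this diff), while query and language_code mirror the ChatQuery fields read above (language_code 0 triggers the Arabic round-trip translation):

import requests

payload = {"query": "What are the symptoms of anemia?", "language_code": 1}
resp = requests.post("http://localhost:8000/chat", json=payload)  # path and port are assumptions
print(resp.json())  # expected shape: {"response": "I hope this answers your question: ...", "success": True}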