thechaiexperiment committed
Commit 9b08b8e · verified · 1 Parent(s): f9e3554

Update app.py

Files changed (1)
  1. app.py +136 -239
app.py CHANGED
@@ -340,159 +340,117 @@ def retrieve_metadata(document_indices: List[int], metadata_path: str = 'recipes
         print(f"Error retrieving metadata: {e}")
         return {}
 
-def rerank_documents(query: str, document_ids: List[str], document_texts: List[str], cross_encoder_model) -> List[Tuple[float, str, str]]:
+def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
     try:
-        # Batch process all documents at once
         pairs = [(query, doc) for doc in document_texts]
-        scores = cross_encoder_model.predict(pairs, batch_size=8)  # Increased batch size
+        scores = cross_encoder_model.predict(pairs)
         scored_documents = list(zip(scores, document_ids, document_texts))
         scored_documents.sort(key=lambda x: x[0], reverse=True)
+        print("Reranked results:")
+        for idx, (score, doc_id, doc) in enumerate(scored_documents):
+            print(f"Rank {idx + 1} (Score: {score:.4f}, Document ID: {doc_id})")
         return scored_documents
     except Exception as e:
         print(f"Error reranking documents: {e}")
         return []
 
-def extract_entities_batch(texts: List[str], biobert_tokenizer, biobert_model, batch_size: int = 8) -> List[List[str]]:
-    try:
-        all_entities = []
-        for i in range(0, len(texts), batch_size):
-            batch_texts = texts[i:i + batch_size]
-            # Process multiple texts in parallel
-            inputs = biobert_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
-            with torch.no_grad():  # Disable gradient calculation
-                outputs = biobert_model(**inputs)
-
-            predictions = torch.argmax(outputs.logits, dim=2)
-
-            for j, (input_ids, preds) in enumerate(zip(inputs.input_ids, predictions)):
-                tokens = biobert_tokenizer.convert_ids_to_tokens(input_ids)
-                entities = [tokens[k] for k in range(len(tokens)) if preds[k].item() != 0]
-                all_entities.append(entities)
-
-        return all_entities
-    except Exception as e:
-        print(f"Error in batch entity extraction: {e}")
-        return [[] for _ in texts]
-
-def extract_relevant_portions(document_texts: List[str], query: str, biobert_tokenizer, biobert_model,
-                              max_portions: int = 3, portion_size: int = 1) -> Dict[str, List[str]]:
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import nltk
+
+def extract_relevant_portions(query_embedding, top_documents, embeddings_data, max_portions=3):
     try:
-        # Process query and all documents in one batch
-        all_texts = [query] + document_texts
-        all_entities = extract_entities_batch(all_texts, biobert_tokenizer, biobert_model)
-
-        query_entities = set(all_entities[0])
         relevant_portions = {}
-
-        def process_document(doc_idx: int) -> Tuple[str, List[str]]:
-            doc_text = document_texts[doc_idx]
-            doc_entities = set(all_entities[doc_idx + 1])  # +1 because query was first
-
-            sentences = nltk.sent_tokenize(doc_text)
-            doc_relevant_portions = []
-
-            # Score sentences based on entity overlap
-            sentence_scores = []
-            for i, sentence in enumerate(sentences):
-                entity_overlap = len(query_entities.intersection(doc_entities))
-                sentence_scores.append((entity_overlap, i))
-
-            # Sort and select top sentences
-            sentence_scores.sort(reverse=True)
-            for _, sent_idx in sentence_scores[:max_portions]:
-                start_idx = max(0, sent_idx - portion_size // 2)
-                end_idx = min(len(sentences), sent_idx + portion_size // 2 + 1)
-                portion = " ".join(sentences[start_idx:end_idx])
-                doc_relevant_portions.append(portion)
-
-            return f"Document_{doc_idx}", doc_relevant_portions
-
-        # Process documents in parallel
-        with ThreadPoolExecutor(max_workers=4) as executor:
-            results = list(executor.map(lambda x: process_document(x), range(len(document_texts))))
-
-        relevant_portions = dict(results)
+
+        for _, doc_id, doc_text in top_documents:
+            if doc_id not in embeddings_data:
+                print(f"Warning: No embedding available for Document ID {doc_id}. Skipping...")
+                continue
+
+            # Retrieve the precomputed embedding for this document
+            doc_embedding = np.array(embeddings_data[doc_id])
+
+            # Compute similarity between the query embedding and the document embedding
+            similarity = cosine_similarity(query_embedding, [doc_embedding]).flatten()[0]
+
+            # Split the document into sentences
+            sentences = nltk.sent_tokenize(doc_text)
+
+            # Rank sentences based on their length (proxy for importance) or other heuristic
+            # Since we're using document-level embeddings, we assume all sentences are equally relevant.
+            sorted_sentences = sorted(sentences, key=lambda x: len(x), reverse=True)[:max_portions]
+
+            relevant_portions[doc_id] = sorted_sentences
+
+            print(f"Extracted relevant portions for Document ID {doc_id} (Similarity: {similarity:.4f}):")
+            for i, sentence in enumerate(sorted_sentences, start=1):
+                print(f"  Portion {i}: {sentence[:100]}...")  # Print first 100 characters for preview
+
         return relevant_portions
-
     except Exception as e:
-        print(f"Error extracting relevant portions: {e}")
-        return {f"Document_{i}": [] for i in range(len(document_texts))}
+        print(f"Error in extract_relevant_portions: {e}")
+        return {}
+
+
+def remove_duplicates(selected_parts):
+    unique_sentences = set()
+    unique_selected_parts = []
+    for sentence in selected_parts:
+        if sentence not in unique_sentences:
+            unique_selected_parts.append(sentence)
+            unique_sentences.add(sentence)
+    return unique_selected_parts
 
-def generate_answer(prompt: str, tokenizer_f, model_f, max_length: int = 860, temperature: float = 0.2) -> str:
+def extract_entities(text):
     try:
-        # Optimize input processing
-        inputs = tokenizer_f(prompt, return_tensors="pt", truncation=True, max_length=512)
-
-        with torch.no_grad():  # Disable gradient calculation
-            output_ids = model_f.generate(
-                inputs.input_ids,
-                max_length=max_length,
-                num_return_sequences=1,
-                temperature=temperature,
-                pad_token_id=tokenizer_f.eos_token_id,
-                do_sample=False,  # Use greedy decoding for faster generation
-                early_stopping=True
-            )
-
-        answer = tokenizer_f.decode(output_ids[0], skip_special_tokens=True)
-
-        # Quick relevance check
-        if any(word in answer.lower() for word in prompt.lower().split()):
-            return answer
-        return "I apologize, but I cannot provide a relevant answer based on the given information."
-
+        biobert_tokenizer = models['bio_tokenizer']
+        biobert_model = models['bio_model']
+        inputs = biobert_tokenizer(text, return_tensors="pt")
+        outputs = biobert_model(**inputs)
+        predictions = torch.argmax(outputs.logits, dim=2)
+        tokens = biobert_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
+        entities = [
+            tokens[i]
+            for i in range(len(tokens))
+            if predictions[0][i].item() != 0  # Assuming 0 is the label for non-entity
+        ]
+        return entities
     except Exception as e:
-        print(f"Error generating answer: {e}")
-        return "I apologize, but I encountered an error while generating the answer."
+        print(f"Error extracting entities: {e}")
+        return []
 
 def enhance_passage_with_entities(passage, entities):
     return f"{passage}\n\nEntities: {', '.join(entities)}"
 
-
-def create_prompt(question: str, passage: str) -> str:
-    return f"""As a medical expert, answer the following question based only on the provided passage. Be concise and direct.
-    Passage: {passage}
-    Question: {question}
-    Answer:"""
-
-def process_query_and_generate_answer(
-    query: str,
-    relevant_documents: List[Tuple[float, str, str]],
-    models: Dict,
-    max_portions: int = 3
-) -> str:
-    try:
-        # Extract relevant portions from top documents
-        relevant_portions = extract_relevant_portions(
-            [doc[2] for doc in relevant_documents[:3]],  # Use top 3 documents
-            query,
-            models['bio_tokenizer'],
-            models['bio_model'],
-            max_portions=max_portions
-        )
-
-        # Combine relevant portions
-        all_portions = []
-        for doc_portions in relevant_portions.values():
-            all_portions.extend(doc_portions)
-
-        # Remove duplicates while preserving order
-        unique_portions = list(dict.fromkeys(all_portions))
-
-        # Create context from unique portions
-        context = " ".join(unique_portions[:max_portions])
-
-        # Generate and return answer
-        prompt = create_prompt(query, context)
-        return generate_answer(
-            prompt,
-            models['llm_tokenizer'],
-            models['llm_model']
-        )
-
-    except Exception as e:
-        print(f"Error in query processing pipeline: {e}")
-        return "I apologize, but I encountered an error while processing your question."
+def create_prompt(question, passage):
+    prompt = ("""
+    As a medical expert, you are required to answer the following question based only on the provided passage. Do not include any information not present in the passage. Your response should directly reflect the content of the passage. Maintain accuracy and relevance to the provided information.
+    Passage: {passage}
+    Question: {question}
+    Answer:
+    """)
+    return prompt.format(passage=passage, question=question)
+
+def generate_answer(prompt, max_length=860, temperature=0.2):
+    tokenizer_f = models['llm_tokenizer']
+    model_f = models['llm_model']
+    inputs = tokenizer_f(prompt, return_tensors="pt", truncation=True)
+    output_ids = model_f.generate(
+        inputs.input_ids,
+        max_length=max_length,
+        num_return_sequences=1,
+        temperature=temperature,
+        pad_token_id=tokenizer_f.eos_token_id
+    )
+    answer = tokenizer_f.decode(output_ids[0], skip_special_tokens=True)
+    passage_keywords = set(prompt.lower().split())
+    answer_keywords = set(answer.lower().split())
+    if passage_keywords.intersection(answer_keywords):
+        return answer
+    else:
+        return "Sorry, I can't help with that."
+
 def remove_answer_prefix(text):
     prefix = "Answer:"
     if prefix in text:
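For reference, here is a minimal sketch of how the reworked retrieval helpers above (extract_relevant_portions and remove_duplicates) might be exercised on their own. The document IDs, texts, embedding dimension, and random vectors are placeholders; it assumes the functions from app.py are in scope, that numpy is available as np, and that NLTK's punkt tokenizer data has already been downloaded. Note that it flattens the per-document sentence lists before deduplicating, whereas the endpoint in the next hunk passes the returned dict straight to remove_duplicates.

import numpy as np

# Placeholder inputs: hypothetical document IDs, texts, and 384-dim random embeddings.
query_embedding = np.random.rand(1, 384)  # kept 2-D, since cosine_similarity expects a matrix-like input
top_documents = [
    (0.92, "doc_1", "First article text. It has several sentences about diet."),
    (0.85, "doc_2", "Second article text. Shorter, with two sentences."),
]
embeddings_data = {"doc_1": np.random.rand(384), "doc_2": np.random.rand(384)}

portions = extract_relevant_portions(query_embedding, top_documents, embeddings_data, max_portions=2)

# Flatten the per-document sentence lists, then drop repeats while preserving order.
flattened = [sentence for sentences in portions.values() for sentence in sentences]
print(remove_duplicates(flattened))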
 
@@ -558,132 +516,71 @@ async def health_check():
 @app.post("/api/chat")
 async def chat_endpoint(chat_query: ChatQuery):
     try:
-        # Initialize response timing
-        start_time = asyncio.get_event_loop().time()
-
-        # Extract query and handle translation
         query_text = chat_query.query
         language_code = chat_query.language_code
-
+
+        # Translate Arabic to English if language_code is 0
         if language_code == 0:
-            query_text = await run_in_threadpool(translate_ar_to_en, query_text)
-
-        # Embed query and load embeddings in parallel
-        query_embedding_task = run_in_threadpool(embed_query_text, query_text)
-        embeddings_data_task = run_in_threadpool(load_embeddings)
-
-        # Wait for both tasks to complete
-        query_embedding, embeddings_data = await asyncio.gather(
-            query_embedding_task,
-            embeddings_data_task
-        )
-
-        # Initial document retrieval
+            query_text = translate_ar_to_en(query_text)
+
+        # Generate query embedding
+        query_embedding = embed_query_text(query_text)
         n_results = 5
+
+        # Load embeddings and retrieve initial results
+        embeddings_data = load_embeddings()
         folder_path = 'downloaded_articles/downloaded_articles'
-
-        # Get initial results and retrieve documents
-        initial_results = await run_in_threadpool(
-            query_embeddings,
-            query_embedding,
-            embeddings_data,
-            n_results
-        )
-
+        initial_results = query_embeddings(query_embedding, embeddings_data, n_results)
+
+        # Extract document IDs and texts
         document_ids = [doc_id for doc_id, *_ in initial_results]
-        document_texts = await run_in_threadpool(
-            retrieve_document_texts,
-            document_ids,
-            folder_path
-        )
-
-        # Rerank documents
+        document_texts = retrieve_document_texts(document_ids, folder_path)
+
+        # Use cross-encoder to score documents
         cross_encoder = models['cross_encoder']
-        scored_documents = await run_in_threadpool(
-            rerank_documents,
-            query_text,
-            document_ids,
-            document_texts,
-            cross_encoder
-        )
-
-        # Process documents and generate answer
-        async with asyncio.TaskGroup() as tg:
-            # Extract entities in parallel
-            entities_task = tg.create_task(
-                run_in_threadpool(
-                    extract_entities_batch,
-                    [query_text] + [doc[2] for doc in scored_documents[:3]],
-                    models['bio_tokenizer'],
-                    models['bio_model']
-                )
-            )
-
-            # Extract relevant portions
-            portions_task = tg.create_task(
-                run_in_threadpool(
-                    extract_relevant_portions,
-                    [doc[2] for doc in scored_documents[:3]],
-                    query_text,
-                    models['bio_tokenizer'],
-                    models['bio_model']
-                )
-            )
-
-        entities = (await entities_task)[0]  # First item is query entities
-        relevant_portions = await portions_task
-
-        # Flatten and process portions
-        flattened_portions = []
-        for doc_portions in relevant_portions.values():
-            flattened_portions.extend(doc_portions)
-
-        unique_selected_parts = list(dict.fromkeys(flattened_portions))
+        scores = cross_encoder.predict([(query_text, doc) for doc in document_texts])
+
+        # Score and sort documents
+        scored_documents = list(zip(scores, document_ids, document_texts))
+        scored_documents.sort(key=lambda x: x[0], reverse=True)
+
+        # Extract relevant portions
+        relevant_portions = extract_relevant_portions(query_embedding, scored_documents, embeddings_data, max_portions=3)
+        unique_selected_parts = remove_duplicates(relevant_portions)
         combined_parts = " ".join(unique_selected_parts)
-
-        # Enhance passage and create prompt
+
+        # Build context and enhance passage with entities
+        context = [query_text] + unique_selected_parts
+        entities = extract_entities(query_text)
         passage = enhance_passage_with_entities(combined_parts, entities)
+
+        # Create prompt and generate answer
         prompt = create_prompt(query_text, passage)
-
-        # Generate answer
-        answer = await run_in_threadpool(
-            generate_answer,
-            prompt,
-            models['llm_tokenizer'],
-            models['llm_model']
-        )
-
-        # Process answer
+        answer = generate_answer(prompt)
         answer_part = answer.split("Answer:")[-1].strip()
-        cleaned_answer = await run_in_threadpool(remove_answer_prefix, answer_part)
-        final_answer = await run_in_threadpool(remove_incomplete_sentence, cleaned_answer)
-
-        # Handle translation if needed
+
+        # Clean and finalize the answer
+        cleaned_answer = remove_answer_prefix(answer_part)
+        final_answer = remove_incomplete_sentence(cleaned_answer)
+
+        # Translate English back to Arabic if needed
         if language_code == 0:
-            final_answer = await run_in_threadpool(translate_en_to_ar, final_answer)
-
-        # Calculate response time
-        end_time = asyncio.get_event_loop().time()
-        response_time = end_time - start_time
-
+            final_answer = translate_en_to_ar(final_answer)
+
+        # Print and return the answer
         if final_answer:
-            print(f"Answer generated in {response_time:.2f} seconds")
+            print("Answer:")
             print(final_answer)
-
-            return {
-                "response": f"I hope this answers your question: {final_answer}",
-                "success": True,
-                "response_time": response_time
-            }
         else:
-            return {
-                "response": "Sorry, I can't help with that.",
-                "success": False,
-                "response_time": response_time
-            }
-
+            print("Sorry, I can't help with that.")
+
+        return {
+            "response": f"I hope this answers your question: {final_answer}",
+            # "conversation_id": chat_query.conversation_id,  # Uncomment if needed
+            "success": True
+        }
+
     except Exception as e:
-        print(f"Error in chat endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/api/resources")
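To close, a hedged example of calling the simplified /api/chat endpoint once the app is running. The base URL is an assumption (adjust it to wherever the FastAPI app is served), and it assumes ChatQuery only requires the query and language_code fields visible in the diff; a language_code of 0 triggers the Arabic-to-English translation of the query and the translation of the answer back to Arabic, while any other value skips both.

import requests

BASE_URL = "http://localhost:7860"  # hypothetical host/port for the running FastAPI app

payload = {
    "query": "What foods are rich in vitamin D?",
    "language_code": 1,  # 0 would route the request through the Arabic translation path
}

resp = requests.post(f"{BASE_URL}/api/chat", json=payload)
resp.raise_for_status()
data = resp.json()
print(data["success"])
print(data["response"])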