Update calendar_rag.py
calendar_rag.py CHANGED (+113 -17)
@@ -5,6 +5,7 @@ from haystack.components.embedders import SentenceTransformersDocumentEmbedder
 from haystack.components.retrievers.in_memory import *
 from haystack.document_stores.in_memory import InMemoryDocumentStore
 from haystack.utils import Secret
+from sentence_transformers import CrossEncoder
 from pathlib import Path
 import hashlib
 from datetime import *
@@ -118,7 +119,17 @@ class TuitionFee:
     event_type: str
     regular_fee: RegularFee
     late_payment_fee: LatePaymentFee
+
+class SentenceTransformersCrossEncoder:
+    """Wrapper for the sentence-transformers CrossEncoder for compatibility with the existing code"""
 
+    def __init__(self, model_name_or_path: str):
+        """Initialize the cross-encoder model"""
+        self.model = CrossEncoder(model_name_or_path)
+
+    def predict(self, sentence_pairs: List[Tuple[str, str]]) -> List[float]:
+        """Predict relevance scores for sentence pairs"""
+        return self.model.predict(sentence_pairs)
 
 class OpenAIDateParser:
     """Uses OpenAI to parse complex Thai date formats"""
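For orientation, a minimal sketch of the new wrapper used in isolation (the SentenceTransformersCrossEncoder class from the hunk above is assumed to be in scope, the example passages are made up, and the model weights download on first instantiation):

from typing import List, Tuple
from sentence_transformers import CrossEncoder

reranker = SentenceTransformersCrossEncoder("cross-encoder/mmarco-mMiniLMv2-L12-H384-v1")

pairs: List[Tuple[str, str]] = [
    ("เปิดเรียนวันไหน", "Semester 1 classes begin on 13 August."),     # on-topic (query: "when does the semester start?")
    ("เปิดเรียนวันไหน", "The late payment fee is 500 baht per day."),  # off-topic
]
scores = reranker.predict(pairs)  # one relevance score per (query, passage) pair
# Higher score means more relevant, so the on-topic passage should rank first.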
@@ -414,17 +425,21 @@ class CacheManager:
         self.document_cache[doc_id] = (document, datetime.now())
         self._save_cache("documents", self.document_cache)
 
 @dataclass
 class ModelConfig:
     openai_api_key: str
-
-    embedder_model: str = "sentence-transformers/mUSE"
+    embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
     openai_model: str = "gpt-4o"
     temperature: float = 0.7
+    reranker_model: str = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"  # cross-encoder used for second-stage reranking
 
 @dataclass
 class RetrieverConfig:
     top_k: int = 5
+    use_reranking: bool = True  # enable two-stage retrieval with cross-encoder reranking
+    top_k_initial: int = 20  # candidates fetched by hybrid search before reranking
+    top_k_final: int = 5  # documents kept after reranking
 
 @dataclass
 class CacheConfig:
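How the new retriever knobs relate, shown as a small sketch (construction only; the values are the dataclass defaults above):

retriever = RetrieverConfig()  # top_k=5, use_reranking=True, top_k_initial=20, top_k_final=5
# The cross-encoder only reorders and narrows: 20 candidates in, 5 documents out.
assert retriever.top_k_initial >= retriever.top_k_final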
@@ -450,7 +465,8 @@ class PipelineConfig:
 
 def create_default_config(api_key: str) -> PipelineConfig:
     """
-    Create a default pipeline configuration with optimized settings for Thai language processing
+    Create a default pipeline configuration with optimized settings for Thai language processing,
+    including reranking capabilities.
 
     Args:
         api_key (str): OpenAI API key
@@ -461,10 +477,14 @@ def create_default_config(api_key: str) -> PipelineConfig:
     return PipelineConfig(
         model=ModelConfig(
             openai_api_key=api_key,
+            embedder_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
             temperature=0.3  # Lower temperature for more focused responses
         ),
         retriever=RetrieverConfig(
-            top_k=5
+            top_k=5,  # number of documents handed to the generator
+            use_reranking=True,  # enable reranking
+            top_k_initial=20,  # retrieve more initial documents for reranking
+            top_k_final=5  # final number of documents after reranking
         ),
         cache=CacheConfig(
             enabled=True,
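Call-site shape for the updated factory, as a hedged sketch (sourcing the key from an environment variable is an assumption, not something the diff shows):

import os

config = create_default_config(api_key=os.environ["OPENAI_API_KEY"])
# With these defaults, every query retrieves 20 candidates and reranks them down to 5.
pipeline = AcademicCalendarRAG(config)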
@@ -1278,6 +1298,68 @@ class HybridDocumentStore:
         )
 
         return sorted_docs[:top_k]
+
+    def search_with_reranking(self,
+                              query: str,
+                              event_type: Optional[str] = None,
+                              detail_type: Optional[str] = None,
+                              semester: Optional[str] = None,
+                              top_k_initial: int = 20,
+                              top_k_final: int = 5,
+                              weight_semantic: float = 0.5) -> List[Document]:
+        """
+        Two-stage retrieval: hybrid search followed by cross-encoder reranking
+        """
+        # Generate cache key for the reranked query
+        cache_key = json.dumps({
+            'query': query,
+            'event_type': event_type,
+            'detail_type': detail_type,
+            'semester': semester,
+            'top_k_initial': top_k_initial,
+            'top_k_final': top_k_final,
+            'weight_semantic': weight_semantic,
+            'reranked': True  # Distinguish reranked entries from plain hybrid-search entries
+        })
+
+        # Check cache first
+        cached_results = self.cache_manager.get_query_cache(cache_key)
+        if cached_results is not None:
+            return cached_results
+
+        # 1. Get a larger initial result set
+        initial_results = self.hybrid_search(
+            query=query,
+            event_type=event_type,
+            detail_type=detail_type,
+            semester=semester,
+            top_k=top_k_initial,
+            weight_semantic=weight_semantic
+        )
+
+        # If we don't have enough initial results, just return what we have
+        if len(initial_results) <= top_k_final:
+            return initial_results
+
+        try:
+            # 2. Rerank with a cross-encoder, constructed on demand
+            # (note: this reloads the model on every call; cache the instance if latency matters)
+            cross_encoder = SentenceTransformersCrossEncoder("cross-encoder/mmarco-mMiniLMv2-L12-H384-v1")
+            pairs = [(query, doc.content) for doc in initial_results]
+            scores = cross_encoder.predict(pairs)
+
+            for doc, score in zip(initial_results, scores):
+                doc.score = float(score)  # Ensure score is a plain float
+
+            reranked_results = sorted(initial_results, key=lambda x: x.score, reverse=True)[:top_k_final]
+
+            # Cache the results
+            self.cache_manager.set_query_cache(cache_key, reranked_results)
+
+            return reranked_results
+
+        except Exception as e:
+            logger.error(f"Reranking failed: {str(e)}. Falling back to hybrid search results.")
+            return initial_results[:top_k_final]
 
 class ResponseGenerator:
     """Generate responses with enhanced conversation context awareness"""
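search_with_reranking is the standard retrieve-then-rerank pattern; distilled to its essentials as a self-contained sketch (stand-in passages, no caching or metadata filters):

from typing import List
from sentence_transformers import CrossEncoder

def rerank(query: str, passages: List[str], top_k: int = 5) -> List[str]:
    """Score (query, passage) pairs with a cross-encoder and keep the best top_k."""
    model = CrossEncoder("cross-encoder/mmarco-mMiniLMv2-L12-H384-v1")
    scores = model.predict([(query, p) for p in passages])
    ranked = sorted(zip(scores, passages), key=lambda x: x[0], reverse=True)
    return [p for _, p in ranked[:top_k]]

Loading the model inside the function mirrors the method above; in practice you would construct it once and reuse it across queries.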
@@ -1541,7 +1623,7 @@ class AdvancedQueryProcessor:
 # First, let's modify the AcademicCalendarRAG class to maintain conversation history
 
 class AcademicCalendarRAG:
-    """Enhanced RAG system for academic calendar and program information with conversation memory"""
+    """Enhanced RAG system for academic calendar and program information with conversation memory and reranking"""
 
     def __init__(self, config: PipelineConfig):
         self.config = config
@@ -1599,7 +1681,7 @@ class AcademicCalendarRAG:
             raise
 
     def process_query(self, query: str, conversation_history=None) -> Dict[str, Any]:
-        """Process user query using conversation history and hybrid retrieval."""
+        """Process user query using conversation history and hybrid retrieval with reranking."""
         # Use provided conversation history or the internal history
         if conversation_history is not None:
             self.conversation_history = conversation_history
@@ -1635,16 +1717,30 @@ class AcademicCalendarRAG:
 
             weight_semantic = weight_values[attempt - 1]
 
-            # Get relevant documents using
+            # Get relevant documents using reranking if enabled
             logger.info(f"Attempt {attempt}: Searching with weight_semantic={weight_semantic}")
-            documents = self.document_store.hybrid_search(
-                query=query_with_context if attempt == 1 else query,
-                event_type=query_info.get("event_type"),
-                detail_type=query_info.get("detail_type"),
-                semester=query_info.get("semester"),
-                top_k=self.config.retriever.top_k,
-                weight_semantic=weight_semantic
-            )
+
+            if self.config.retriever.use_reranking:
+                documents = self.document_store.search_with_reranking(
+                    query=query_with_context if attempt == 1 else query,
+                    event_type=query_info.get("event_type"),
+                    detail_type=query_info.get("detail_type"),
+                    semester=query_info.get("semester"),
+                    top_k_initial=self.config.retriever.top_k_initial,
+                    top_k_final=self.config.retriever.top_k_final,
+                    weight_semantic=weight_semantic
+                )
+                logger.info(f"Using reranking for retrieval, got {len(documents)} documents")
+            else:
+                documents = self.document_store.hybrid_search(
+                    query=query_with_context if attempt == 1 else query,
+                    event_type=query_info.get("event_type"),
+                    detail_type=query_info.get("detail_type"),
+                    semester=query_info.get("semester"),
+                    top_k=self.config.retriever.top_k,
+                    weight_semantic=weight_semantic
+                )
+                logger.info(f"Using standard hybrid search, got {len(documents)} documents")
 
             # Generate response with conversation context
             response = self.response_generator.generate_response(
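End to end, the flag only changes the retrieval stage; a hypothetical interaction (placeholder key, data loading elided, Thai query glossed in the comment):

rag = AcademicCalendarRAG(create_default_config(api_key="sk-..."))
# ... load calendar documents as elsewhere in the file ...
result = rag.process_query("ค่าเทอมเท่าไหร่")  # "How much is tuition?"
# result is a Dict[str, Any] per the signature above, built from the top 5 reranked documents.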
@@ -1695,8 +1791,8 @@ class AcademicCalendarRAG:
 # pipeline.load_data(raw_data)
 
 # # Test queries with different semantic weights
-#
-# queries = ["
+# queries = ["ค่าเทอมเท่าไหร่","เปิดเรียนวันไหน","ขั้นตอนการสมัครที่สาขานี้มีอะไรบ้าง","ต้องใช้ระดับภาษาอังกฤษเท่าไหร่ในการสมัครเรียนที่นี้","ถ้าจะไปติดต่อมาหลายต้องลง mrt อะไร","มีวิชาหลักเเละวิชาเลือกออะไรบ้าง", "ปีที่ 1 เทอม 1 ต้องเรียนอะไรบ้าง", "ปีที่ 2 เทอม 1 ต้องเรียนอะไรบ้าง"]
+# # queries = ["ต้องใช้ระดับภาษาอังกฤษเท่าไหร่ในการสมัครเรียนที่นี้"]
 # print("=" * 80)
 
 # for query in queries:

(In English, the commented-out test queries ask: how much is tuition; when does the semester start; what are the application steps for this program; what English level is required to apply; which MRT station to get off at when visiting the university; what the core and elective courses are; and what is studied in Year 1 Semester 1 and Year 2 Semester 1.)