Spaces:

JirasakJo
/

Questions_Graduate_Studies_Calendar_2024

Sleeping

App Files Community

JirasakJo committed on Mar 2

Commit

b16b6fc

verified ·

1 Parent(s): 798cabe

Update calendar_rag.py

Browse files

Files changed (1) hide show

calendar_rag.py +23 -58

calendar_rag.py CHANGED Viewed

@@ -5,7 +5,6 @@ from haystack.components.embedders import SentenceTransformersDocumentEmbedder
 from haystack.components.retrievers.in_memory import *
 from haystack.document_stores.in_memory import InMemoryDocumentStore
 from haystack.utils import Secret
-from sentence_transformers import CrossEncoder
 from pathlib import Path
 import hashlib
 from datetime import *
@@ -119,17 +118,7 @@ class TuitionFee:
     event_type: str
     regular_fee: RegularFee
     late_payment_fee: LatePaymentFee
-class SentenceTransformersCrossEncoder:
-    """Wrapper for the sentence-transformers CrossEncoder for compatibility with the existing code"""
-    def __init__(self, model_name_or_path: str):
-        """Initialize the cross-encoder model"""
-        self.model = CrossEncoder(model_name_or_path)
-    def predict(self, sentence_pairs: List[Tuple[str, str]]) -> List[float]:
-        """Predict relevance scores for sentence pairs"""
-        return self.model.predict(sentence_pairs)
 class OpenAIDateParser:
     """Uses OpenAI to parse complex Thai date formats"""
@@ -425,21 +414,16 @@ class CacheManager:
         self.document_cache[doc_id] = (document, datetime.now())
         self._save_cache("documents", self.document_cache)
-@dataclass
 @dataclass
 class ModelConfig:
     openai_api_key: str
     embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
     openai_model: str = "gpt-4o"
     temperature: float = 0.7
-    reranker_model: str = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"  # Add this
 @dataclass
 class RetrieverConfig:
     top_k: int = 5
-    use_reranking: bool = True  # Add this flag
-    top_k_initial: int = 20     # Add this parameter
-    top_k_final: int = 5        # Add this parameter
 @dataclass
 class CacheConfig:
@@ -465,8 +449,7 @@ class PipelineConfig:
 def create_default_config(api_key: str) -> PipelineConfig:
     """
-    Create a default pipeline configuration with optimized settings for Thai language processing,
-    including reranking capabilities.
     Args:
         api_key (str): OpenAI API key
@@ -477,14 +460,10 @@ def create_default_config(api_key: str) -> PipelineConfig:
     return PipelineConfig(
         model=ModelConfig(
             openai_api_key=api_key,
-            embedder_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
             temperature=0.3  # Lower temperature for more focused responses
         ),
         retriever=RetrieverConfig(
-            top_k=5,           # Optimal number of documents to retrieve
-            use_reranking=True,  # Enable reranking
-            top_k_initial=20,    # Retrieve more initial documents for reranking
-            top_k_final=5        # Final number of documents after reranking
         ),
         cache=CacheConfig(
             enabled=True,
@@ -1300,13 +1279,13 @@ class HybridDocumentStore:
         return sorted_docs[:top_k]
     def search_with_reranking(self,
-                             query: str,
-                             event_type: Optional[str] = None,
-                             detail_type: Optional[str] = None,
-                             semester: Optional[str] = None,
-                             top_k_initial: int = 20,
-                             top_k_final: int = 5,
-                             weight_semantic: float = 0.5) -> List[Document]:
         """
         Two-stage retrieval with hybrid search followed by cross-encoder reranking
         """
@@ -1402,10 +1381,10 @@ class ResponseGenerator:
             5. สำหรับคำถามเกี่ยวกับข้อกำหนดภาษาอังกฤษหรือขั้นตอนการสมัคร ให้อธิบายข้อมูลอย่างละเอียด
             6. ใส่ข้อความ "หากมีข้อสงสัยเพิ่มเติม สามารถสอบถามได้" ท้ายคำตอบเสมอ
             7. คำนึงถึงประวัติการสนทนาและให้คำตอบที่ต่อเนื่องกับบทสนทนาก่อนหน้า
-            8. หากคำถามอ้างอิงถึงข้อมูลในบทสนทนาก่อนหน้า (เช่น "แล้วอันนั้นล่ะ", "มีอะไรอีกบ้าง", "คำถามก่อนหน้า") ให้พิจารณาบริบทและตอบคำถามอย่างตรงประเด็น
             9. กรณีคำถามมีความไม่ชัดเจน ใช้ประวัติการสนทนาเพื่อเข้าใจบริบทของคำถาม
-            สิ่งสำคัญพิเศษ: หากคำถามอ้างอิงถึงคำถามก่อนหน้า ให้แสดงคำถามก่อนหน้านั้นในคำตอบด้วย เช่น "คำถามก่อนหน้าคือ [คำถามก่อนหน้า] และคำตอบคือ..."
             กรุณาตอบเป็นภาษาไทย:
             """
@@ -1623,7 +1602,7 @@ class AdvancedQueryProcessor:
 # First, let's modify the AcademicCalendarRAG class to maintain conversation history
 class AcademicCalendarRAG:
-    """Enhanced RAG system for academic calendar and program information with conversation memory and reranking"""
     def __init__(self, config: PipelineConfig):
         self.config = config
@@ -1681,7 +1660,7 @@ class AcademicCalendarRAG:
             raise
     def process_query(self, query: str, conversation_history=None) -> Dict[str, Any]:
-        """Process user query using conversation history and hybrid retrieval with reranking."""
         # Use provided conversation history or the internal history
         if conversation_history is not None:
             self.conversation_history = conversation_history
@@ -1717,30 +1696,16 @@ class AcademicCalendarRAG:
                 weight_semantic = weight_values[attempt - 1]
-                # Get relevant documents using reranking if enabled
                 logger.info(f"Attempt {attempt}: Searching with weight_semantic={weight_semantic}")
-                if self.config.retriever.use_reranking:
-                    documents = self.document_store.search_with_reranking(
-                        query=query_with_context if attempt == 1 else query,
-                        event_type=query_info.get("event_type"),
-                        detail_type=query_info.get("detail_type"),
-                        semester=query_info.get("semester"),
-                        top_k_initial=self.config.retriever.top_k_initial,
-                        top_k_final=self.config.retriever.top_k_final,
-                        weight_semantic=weight_semantic
-                    )
-                    logger.info(f"Using reranking for retrieval, got {len(documents)} documents")
-                else:
-                    documents = self.document_store.hybrid_search(
-                        query=query_with_context if attempt == 1 else query,
-                        event_type=query_info.get("event_type"),
-                        detail_type=query_info.get("detail_type"),
-                        semester=query_info.get("semester"),
-                        top_k=self.config.retriever.top_k,
-                        weight_semantic=weight_semantic
-                    )
-                    logger.info(f"Using standard hybrid search, got {len(documents)} documents")
                 # Generate response with conversation context
                 response = self.response_generator.generate_response(
@@ -1792,7 +1757,7 @@ class AcademicCalendarRAG:
 #         # Test queries with different semantic weights
 #         queries = ["ค่าเทอมเท่าไหร่","เปิดเรียนวันไหน","ขั้นตอนการสมัครที่สาขานี้มีอะไรบ้าง","ต้องใช้ระดับภาษาอังกฤษเท่าไหร่ในการสมัครเรียนที่นี้","ถ้าจะไปติดต่อมาหลายต้องลง mrt อะไร","มีวิชาหลักเเละวิชาเลือกออะไรบ้าง", "ปีที่ 1 เทอม 1 ต้องเรียนอะไรบ้าง", "ปีที่ 2 เทอม 1 ต้องเรียนอะไรบ้าง"]
-#         # queries = ["ต้องใช้ระดับภาษาอังกฤษเท่าไหร่ในการสมัครเรียนที่นี้"]
 #         print("=" * 80)
 #         for query in queries:

 from haystack.components.retrievers.in_memory import *
 from haystack.document_stores.in_memory import InMemoryDocumentStore
 from haystack.utils import Secret
 from pathlib import Path
 import hashlib
 from datetime import *
     event_type: str
     regular_fee: RegularFee
     late_payment_fee: LatePaymentFee
 class OpenAIDateParser:
     """Uses OpenAI to parse complex Thai date formats"""
         self.document_cache[doc_id] = (document, datetime.now())
         self._save_cache("documents", self.document_cache)
 @dataclass
 class ModelConfig:
     openai_api_key: str
     embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
     openai_model: str = "gpt-4o"
     temperature: float = 0.7
 @dataclass
 class RetrieverConfig:
     top_k: int = 5
 @dataclass
 class CacheConfig:
 def create_default_config(api_key: str) -> PipelineConfig:
     """
+    Create a default pipeline configuration with optimized settings for Thai language processing.
     Args:
         api_key (str): OpenAI API key
     return PipelineConfig(
         model=ModelConfig(
             openai_api_key=api_key,
             temperature=0.3  # Lower temperature for more focused responses
         ),
         retriever=RetrieverConfig(
+            top_k=5  # Optimal number of documents to retrieve
         ),
         cache=CacheConfig(
             enabled=True,
         return sorted_docs[:top_k]
     def search_with_reranking(self,
+                            query: str,
+                            event_type: Optional[str] = None,
+                            detail_type: Optional[str] = None,
+                            semester: Optional[str] = None,
+                            top_k_initial: int = 20,
+                            top_k_final: int = 5,
+                            weight_semantic: float = 0.5) -> List[Document]:
         """
         Two-stage retrieval with hybrid search followed by cross-encoder reranking
         """
             5. สำหรับคำถามเกี่ยวกับข้อกำหนดภาษาอังกฤษหรือขั้นตอนการสมัคร ให้อธิบายข้อมูลอย่างละเอียด
             6. ใส่ข้อความ "หากมีข้อสงสัยเพิ่มเติม สามารถสอบถามได้" ท้ายคำตอบเสมอ
             7. คำนึงถึงประวัติการสนทนาและให้คำตอบที่ต่อเนื่องกับบทสนทนาก่อนหน้า
+            8. หากคำถามอ้างอิงถึงข้อมูลในบทสนทนาก่อนหน้า (เช่น "แล้วอันนั้นล่ะ", "มีอะไรอีกบ้าง", "คำถามก่อนหน้า") ให้พิจารณาบริบทและตอบคำถามอย่างตรงประเด็น แต่ไม่ต้องแสดงคำถามก่อนหน้าในคำตอบ
             9. กรณีคำถามมีความไม่ชัดเจน ใช้ประวัติการสนทนาเพื่อเข้าใจบริบทของคำถาม
+            สำคัญ: ไม่ต้องใส่คำว่า "คำถามก่อนหน้าคือ [คำถามก่อนหน้า] และคำตอบคือ..." ในคำตอบของคุณ ให้ตอบคำถามโดยตรง
             กรุณาตอบเป็นภาษาไทย:
             """
 # First, let's modify the AcademicCalendarRAG class to maintain conversation history
 class AcademicCalendarRAG:
+    """Enhanced RAG system for academic calendar and program information with conversation memory"""
     def __init__(self, config: PipelineConfig):
         self.config = config
             raise
     def process_query(self, query: str, conversation_history=None) -> Dict[str, Any]:
+        """Process user query using conversation history and hybrid retrieval."""
         # Use provided conversation history or the internal history
         if conversation_history is not None:
             self.conversation_history = conversation_history
                 weight_semantic = weight_values[attempt - 1]
+                # Get relevant documents using hybrid search
                 logger.info(f"Attempt {attempt}: Searching with weight_semantic={weight_semantic}")
+                documents = self.document_store.hybrid_search(
+                    query=query_with_context if attempt == 1 else query,
+                    event_type=query_info.get("event_type"),
+                    detail_type=query_info.get("detail_type"),
+                    semester=query_info.get("semester"),
+                    top_k=self.config.retriever.top_k,
+                    weight_semantic=weight_semantic
+                )
                 # Generate response with conversation context
                 response = self.response_generator.generate_response(
 #         # Test queries with different semantic weights
 #         queries = ["ค่าเทอมเท่าไหร่","เปิดเรียนวันไหน","ขั้นตอนการสมัครที่สาขานี้มีอะไรบ้าง","ต้องใช้ระดับภาษาอังกฤษเท่าไหร่ในการสมัครเรียนที่นี้","ถ้าจะไปติดต่อมาหลายต้องลง mrt อะไร","มีวิชาหลักเเละวิชาเลือกออะไรบ้าง", "ปีที่ 1 เทอม 1 ต้องเรียนอะไรบ้าง", "ปีที่ 2 เทอม 1 ต้องเรียนอะไรบ้าง"]
+#         # queries = ["ปีที่ 1 เทอม 1 ต้องเรียนอะไรบ้าง"]
 #         print("=" * 80)
 #         for query in queries: