JirasakJo commited on
Commit
e3973a0
·
verified ·
1 Parent(s): 40b188b

Update calendar_rag.py

Browse files
Files changed (1) hide show
  1. calendar_rag.py +920 -213
calendar_rag.py CHANGED
@@ -1,98 +1,257 @@
1
- from haystack import Pipeline, Document
2
  from haystack.components.generators.openai import OpenAIGenerator
3
  from haystack.components.builders import PromptBuilder
4
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder
5
  from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
6
  from haystack.document_stores.in_memory import InMemoryDocumentStore
7
  from haystack.utils import Secret
 
8
  from pathlib import Path
9
- import logging
10
- from dataclasses import dataclass, field
11
- from typing import List, Dict, Any, Optional
 
 
 
 
 
12
  import json
13
- import asyncio
14
- from datetime import datetime
15
  import re
 
16
 
17
  # Setup logging
18
  logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
 
21
- @dataclass
22
- class LocalizationConfig:
23
- """Configuration for Thai language handling"""
24
- thai_tokenizer_model: str = "thai-tokenizer"
25
- enable_thai_normalization: bool = True
26
- remove_thai_tones: bool = False
27
- keep_english: bool = True
28
- custom_stopwords: List[str] = field(default_factory=list)
29
- custom_synonyms: Dict[str, List[str]] = field(default_factory=dict)
30
-
31
- @dataclass
32
- class RetrieverConfig:
33
- """Configuration for document retrieval"""
34
- top_k: int = 5
35
- similarity_threshold: float = 0.7
36
- filter_duplicates: bool = True
37
 
38
- @dataclass
39
- class ModelConfig:
40
- """Configuration for language models"""
41
- openai_api_key: str
42
- temperature: float = 0.3
43
- max_tokens: int = 2000
44
- model: str = "gpt-4"
45
- embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  @dataclass
48
- class PipelineConfig:
49
- """Main configuration for the RAG pipeline"""
50
- model: ModelConfig
51
- retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
52
- localization: LocalizationConfig = field(default_factory=LocalizationConfig)
53
-
54
- def __post_init__(self):
55
- if not self.model.openai_api_key:
56
- raise ValueError("OpenAI API key is required")
57
 
58
  class ThaiTextPreprocessor:
59
- """Thai text preprocessing utilities"""
60
 
61
- @staticmethod
62
- def normalize_thai_text(text: str) -> str:
63
- """Normalize Thai text"""
 
 
 
 
 
 
 
 
 
64
  if not text:
65
  return text
66
 
 
 
 
 
67
  # Normalize whitespace
68
  text = re.sub(r'\s+', ' ', text.strip())
69
 
70
- # Normalize Thai numerals
71
  thai_digits = '๐๑๒๓๔๕๖๗๘๙'
72
  arabic_digits = '0123456789'
 
73
  for thai, arabic in zip(thai_digits, arabic_digits):
74
  text = text.replace(thai, arabic)
75
 
76
  return text
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  class CalendarEvent:
79
- """Represents an academic calendar event"""
80
-
81
- def __init__(self,
82
- date: str,
83
- activity: str,
84
- semester: str,
85
- event_type: str = "academic",
86
- note: str = "",
87
- time: str = "",
88
- section: Optional[str] = None):
89
- self.date = date
90
- self.activity = activity
91
- self.semester = semester
92
- self.event_type = event_type
93
- self.note = note
94
- self.time = time
95
- self.section = section
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  def to_searchable_text(self) -> str:
98
  """Convert event to searchable text format"""
@@ -100,125 +259,703 @@ class CalendarEvent:
100
  ภาคการศึกษา: {self.semester}
101
  ประเภท: {self.event_type}
102
  วันที่: {self.date}
103
- เวลา: {self.time or '-'}
104
  กิจกรรม: {self.activity}
105
  หมวดหมู่: {self.section or '-'}
106
- หมายเหตุ: {self.note or '-'}
107
  """.strip()
108
 
109
- @staticmethod
110
- def from_dict(data: Dict[str, Any]) -> 'CalendarEvent':
111
- """Create event from dictionary"""
112
- return CalendarEvent(
113
- date=data.get('date', ''),
114
- activity=data.get('activity', ''),
115
- semester=data.get('semester', ''),
116
- event_type=data.get('event_type', 'academic'),
117
- note=data.get('note', ''),
118
- time=data.get('time', ''),
119
- section=data.get('section')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  )
121
 
122
- class CalendarRAG:
123
- """Main RAG pipeline for academic calendar"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  def __init__(self, config: PipelineConfig):
126
- """Initialize the pipeline with configuration"""
127
- self.config = config
128
- self.document_store = InMemoryDocumentStore()
129
  self.embedder = SentenceTransformersDocumentEmbedder(
130
  model=config.model.embedder_model
131
  )
132
- self.text_preprocessor = ThaiTextPreprocessor()
 
 
 
133
 
134
- # Initialize OpenAI components
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  self.generator = OpenAIGenerator(
136
  api_key=Secret.from_token(config.model.openai_api_key),
137
- model=config.model.model,
138
- temperature=config.model.temperature
139
  )
140
-
141
- self.query_analyzer = PromptBuilder(
142
  template="""
143
- วิเคราะห์คำถามเกี่ยวกับปฏิทินการศึกษานี้:
144
- คำถาม: {{query}}
145
 
146
- กรุณาระบุ:
147
- 1. ประเภทของข้อมูลที่ต้องการ
148
- 2. ภาคการศึกษาที่เกี่ยวข้อง
149
- 3. คำสำคัญที่ต้องค้นหา
150
 
151
- ตอบในรูปแบบ JSON:
152
  {
153
  "event_type": "registration|deadline|examination|academic|holiday",
154
- "semester": "ภาคการศึกษาที่ระบุ หรือ null",
155
- "key_terms": ["คำสำคัญไม่เกิน 3 คำ"]
 
156
  }
157
- """
158
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
- self.answer_generator = PromptBuilder(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  template="""
162
- คุณเป็นผู้ช่วยให้ข้อมูลปฏิทินการศึกษา กรุณาตอบคำถามต่อไปนี้โดยใช้ข้อมูลที่ให้มา:
163
 
164
- คำถาม: {{query}}
165
 
166
- ข้อมูลที่เกี่ยวข้อง:
167
- {% for doc in documents %}
168
  ---
169
  {{doc.content}}
170
  {% endfor %}
171
 
172
- คำแนะนำ:
173
- 1. ตอบเป็นภาษาไทย
174
- 2. ระบุวันที่และข้อกำหนดให้ชัดเจน
175
- 3. รวมหมายเหตุหรือเงื่อนไขที่สำคัญ
176
- """
177
- )
178
-
179
- def load_data(self, calendar_data: List[Dict[str, Any]]) -> None:
180
- """Load calendar data into the system"""
181
- documents = []
182
-
183
- for entry in calendar_data:
184
- # Create calendar event
185
- event = CalendarEvent.from_dict(entry)
186
 
187
- # Create searchable document
188
- doc = Document(
189
- content=event.to_searchable_text(),
190
- meta={
191
- "event_type": event.event_type,
192
- "semester": event.semester,
193
- "date": event.date
194
- }
 
 
 
 
 
 
 
 
 
 
 
195
  )
196
- documents.append(doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- # Compute embeddings
199
- embedded_docs = self.embedder.run(documents=documents)["documents"]
 
 
 
200
 
201
- # Store documents
202
- self.document_store.write_documents(embedded_docs)
203
-
204
  def process_query(self, query: str) -> Dict[str, Any]:
205
- """Process a calendar query and return results"""
206
  try:
207
  # Analyze query
208
- query_info = self._analyze_query(query)
209
 
210
  # Retrieve relevant documents
211
- documents = self._retrieve_documents(
212
- query,
213
- event_type=query_info.get("event_type"),
214
- semester=query_info.get("semester")
 
215
  )
216
 
217
- # Generate answer
218
- answer = self._generate_answer(query, documents)
 
 
 
 
219
 
220
  return {
221
- "answer": answer,
222
  "documents": documents,
223
  "query_info": query_info
224
  }
@@ -231,86 +968,56 @@ class CalendarRAG:
231
  "query_info": {}
232
  }
233
 
234
- def _analyze_query(self, query: str) -> Dict[str, Any]:
235
- """Analyze and extract information from query"""
236
- try:
237
- # Normalize query
238
- normalized_query = self.text_preprocessor.normalize_thai_text(query)
239
-
240
- # Get analysis from OpenAI
241
- prompt_result = self.query_analyzer.run(query=normalized_query)
242
- response = self.generator.run(prompt=prompt_result["prompt"])
243
-
244
- if not response or not response.get("replies"):
245
- raise ValueError("Empty response from query analyzer")
246
-
247
- analysis = json.loads(response["replies"][0])
248
- analysis["original_query"] = query
249
-
250
- return analysis
251
-
252
- except Exception as e:
253
- logger.error(f"Query analysis failed: {str(e)}")
254
- return {
255
- "original_query": query,
256
- "event_type": None,
257
- "semester": None,
258
- "key_terms": []
259
- }
260
 
261
- def _retrieve_documents(self,
262
- query: str,
263
- event_type: Optional[str] = None,
264
- semester: Optional[str] = None) -> List[Document]:
265
- """Retrieve relevant documents"""
266
- # Create retriever
267
- retriever = InMemoryEmbeddingRetriever(
268
- document_store=self.document_store,
269
- top_k=self.config.retriever.top_k
270
- )
271
 
272
- # Get query embedding
273
- query_doc = Document(content=query)
274
- embedded_query = self.embedder.run(documents=[query_doc])["documents"][0]
275
 
276
- # Retrieve documents
277
- results = retriever.run(query_embedding=embedded_query.embedding)["documents"]
 
 
278
 
279
- # Filter results if needed
280
- filtered_results = []
281
- for doc in results:
282
- if event_type and doc.meta['event_type'] != event_type:
283
- continue
284
- if semester and doc.meta['semester'] != semester:
285
- continue
286
- filtered_results.append(doc)
287
 
288
- return filtered_results[:self.config.retriever.top_k]
289
-
290
- def _generate_answer(self, query: str, documents: List[Document]) -> str:
291
- """Generate answer from retrieved documents"""
292
- try:
293
- prompt_result = self.answer_generator.run(
294
- query=query,
295
- documents=documents
296
- )
297
-
298
- response = self.generator.run(prompt=prompt_result["prompt"])
299
 
300
- if not response or not response.get("replies"):
301
- raise ValueError("Empty response from answer generator")
 
 
 
302
 
303
- return response["replies"][0]
 
 
 
 
 
304
 
305
- except Exception as e:
306
- logger.error(f"Answer generation failed: {str(e)}")
307
- return "ขออภัย ไม่สามารถสร้างคำตอบได้ในขณะนี้"
308
 
309
- def create_default_config(api_key: str) -> PipelineConfig:
310
- """Create default pipeline configuration"""
311
- model_config = ModelConfig(openai_api_key=api_key)
312
- return PipelineConfig(
313
- model=model_config,
314
- retriever=RetrieverConfig(),
315
- localization=LocalizationConfig()
316
- )
 
1
+ from haystack import *
2
  from haystack.components.generators.openai import OpenAIGenerator
3
  from haystack.components.builders import PromptBuilder
4
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder
5
  from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
6
  from haystack.document_stores.in_memory import InMemoryDocumentStore
7
  from haystack.utils import Secret
8
+ from tenacity import retry, stop_after_attempt, wait_exponential
9
  from pathlib import Path
10
+ import hashlib
11
+ from datetime import *
12
+ from typing import *
13
+ import numpy as np
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ from rouge_score import rouge_scorer
16
+ import pandas as pd
17
+ from dataclasses import *
18
  import json
19
+ import logging
20
+ import os
21
  import re
22
+ import pickle
23
 
24
  # Setup logging
25
  logging.basicConfig(level=logging.INFO)
 
26
 
27
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
+ class OpenAIDateParser:
30
+ """Uses OpenAI to parse complex Thai date formats"""
31
+
32
+ def __init__(self, api_key: str, model: str = "gpt-4"):
33
+ self.generator = OpenAIGenerator(
34
+ api_key=Secret.from_token(api_key),
35
+ model=model
36
+ )
37
+ self.prompt_builder = PromptBuilder(
38
+ template="""
39
+ Parse the following Thai date range into a structured format:
40
+ Date: {{date}}
41
+
42
+ Return in JSON format:
43
+ {
44
+ "start_date": "YYYY-MM-DD",
45
+ "end_date": "YYYY-MM-DD" (if range),
46
+ "is_range": true/false
47
+ }
48
+
49
+ Notes:
50
+ - Convert Buddhist Era (BE) to CE
51
+ - Handle abbreviated Thai months
52
+ - Account for date ranges with dashes
53
+ - Return null for end_date if it's a single date
54
+
55
+ Example inputs and outputs:
56
+ Input: "จ 8 ก.ค. – จ 19 ส.ค. 67"
57
+ Output: {"start_date": "2024-07-08", "end_date": "2024-08-19", "is_range": true}
58
+
59
+ Input: "15 มกราคม 2567"
60
+ Output: {"start_date": "2024-01-15", "end_date": null, "is_range": false}
61
+ """
62
+ )
63
+
64
+ async def parse_date(self, date_str: str) -> Dict[str, Union[str, bool]]:
65
+ """Parse complex Thai date format using OpenAI"""
66
+ try:
67
+ # Build prompt
68
+ result = self.prompt_builder.run(date=date_str)
69
+
70
+ # Get OpenAI response
71
+ response = await self.generator.arun(prompt=result["prompt"])
72
+
73
+ if not response or not response.get("replies"):
74
+ raise ValueError("Empty response from OpenAI")
75
+
76
+ # Parse JSON response
77
+ parsed = json.loads(response["replies"][0])
78
+
79
+ # Validate the parsed dates
80
+ for date_field in ['start_date', 'end_date']:
81
+ if parsed.get(date_field):
82
+ datetime.strptime(parsed[date_field], '%Y-%m-%d')
83
+
84
+ return parsed
85
+
86
+ except Exception as e:
87
+ logger.error(f"OpenAI date parsing failed for '{date_str}': {str(e)}")
88
+ raise ValueError(f"Could not parse date: {date_str}")
89
 
90
  @dataclass
91
+ class ValidationResult:
92
+ """Stores the result of a validation check"""
93
+ is_valid: bool
94
+ errors: List[str]
95
+ warnings: List[str]
96
+ normalized_data: Dict[str, str]
 
 
 
97
 
98
  class ThaiTextPreprocessor:
99
+ """Handles Thai text preprocessing and normalization"""
100
 
101
+ # Thai character normalization mappings
102
+ CHAR_MAP = {
103
+ 'ํา': 'ำ', # Normalize sara am
104
+ '์': '', # Remove yamakkan
105
+ '–': '-', # Normalize dashes
106
+ '—': '-',
107
+ '٫': ',', # Normalize separators
108
+ }
109
+
110
+ @classmethod
111
+ def normalize_thai_text(cls, text: str) -> str:
112
+ """Normalize Thai text by applying character mappings and spacing rules"""
113
  if not text:
114
  return text
115
 
116
+ # Apply character mappings
117
+ for old, new in cls.CHAR_MAP.items():
118
+ text = text.replace(old, new)
119
+
120
  # Normalize whitespace
121
  text = re.sub(r'\s+', ' ', text.strip())
122
 
123
+ # Normalize Thai numerals if present
124
  thai_digits = '๐๑๒๓๔๕๖๗๘๙'
125
  arabic_digits = '0123456789'
126
+
127
  for thai, arabic in zip(thai_digits, arabic_digits):
128
  text = text.replace(thai, arabic)
129
 
130
  return text
131
 
132
+ class CalendarEventValidator:
133
+ """Validates and preprocesses calendar events"""
134
+
135
+ def __init__(self, openai_api_key: str):
136
+ self.preprocessor = ThaiTextPreprocessor()
137
+ self.date_parser = OpenAIDateParser(api_key=openai_api_key)
138
+
139
+ async def validate_event(self, event: 'CalendarEvent') -> ValidationResult:
140
+ """Validate a calendar event and return validation results"""
141
+ errors = []
142
+ warnings = []
143
+ normalized_data = {}
144
+
145
+ # Validate and normalize date using OpenAI
146
+ if event.date:
147
+ try:
148
+ parsed_date = await self.date_parser.parse_date(event.date)
149
+ normalized_data['date'] = parsed_date['start_date']
150
+
151
+ # If it's a date range, store it in the note
152
+ if parsed_date['is_range'] and parsed_date['end_date']:
153
+ range_note = f"ถึงวันที่ {parsed_date['end_date']}"
154
+ if event.note:
155
+ normalized_data['note'] = f"{event.note}; {range_note}"
156
+ else:
157
+ normalized_data['note'] = range_note
158
+
159
+ except ValueError as e:
160
+ errors.append(f"Invalid date format: {event.date}")
161
+ else:
162
+ errors.append("Date is required")
163
+
164
+ # Validate time format if provided
165
+ if event.time:
166
+ time_pattern = r'^([01]?[0-9]|2[0-3]):([0-5][0-9])$'
167
+ if not re.match(time_pattern, event.time):
168
+ errors.append(f"Invalid time format: {event.time}")
169
+ normalized_data['time'] = event.time
170
+
171
+ # Validate and normalize activity
172
+ if event.activity:
173
+ normalized_activity = self.preprocessor.normalize_thai_text(event.activity)
174
+ if len(normalized_activity) < 3:
175
+ warnings.append("Activity description is very short")
176
+ normalized_data['activity'] = normalized_activity
177
+ else:
178
+ errors.append("Activity is required")
179
+
180
+ # Validate semester
181
+ valid_semesters = {'ภาคต้น', 'ภาคปลาย', 'ภาคฤดูร้อน'}
182
+ if event.semester:
183
+ normalized_semester = self.preprocessor.normalize_thai_text(event.semester)
184
+ if normalized_semester not in valid_semesters:
185
+ warnings.append(f"Unusual semester value: {event.semester}")
186
+ normalized_data['semester'] = normalized_semester
187
+ else:
188
+ errors.append("Semester is required")
189
+
190
+ # Validate event type
191
+ valid_types = {'registration', 'deadline', 'examination', 'academic', 'holiday'}
192
+ if event.event_type not in valid_types:
193
+ errors.append(f"Invalid event type: {event.event_type}")
194
+ normalized_data['event_type'] = event.event_type
195
+
196
+ # Normalize note if present and not already set by date range
197
+ if event.note and 'note' not in normalized_data:
198
+ normalized_data['note'] = self.preprocessor.normalize_thai_text(event.note)
199
+
200
+ # Normalize section if present
201
+ if event.section:
202
+ normalized_data['section'] = self.preprocessor.normalize_thai_text(event.section)
203
+
204
+ return ValidationResult(
205
+ is_valid=len(errors) == 0,
206
+ errors=errors,
207
+ warnings=warnings,
208
+ normalized_data=normalized_data
209
+ )
210
+
211
+ # Update CalendarEvent class to include async validation
212
+ @dataclass
213
  class CalendarEvent:
214
+ """Structured representation of a calendar event with validation"""
215
+
216
+ @staticmethod
217
+ def classify_event_type(activity: str) -> str:
218
+ """Classify event type based on activity description"""
219
+ activity_lower = activity.lower()
220
+
221
+ keywords = {
222
+ 'registration': ['ลงทะเบียน', 'ชําระเงิน', 'ค่าธรรมเนียม', 'เปิดเรียน'],
223
+ 'deadline': ['วันสุดท้าย', 'กําหนด', 'ภายใน', 'ต้องส่ง'],
224
+ 'examination': ['สอบ', 'ปริญญานิพนธ์', 'วิทยานิพนธ์', 'สอบปากเปล่า'],
225
+ 'holiday': ['วันหยุด', 'ชดเชย', 'เทศกาล'],
226
+ }
227
+
228
+ for event_type, terms in keywords.items():
229
+ if any(term in activity_lower for term in terms):
230
+ return event_type
231
+ return 'academic'
232
+ date: str
233
+ time: str
234
+ activity: str
235
+ note: str
236
+ semester: str
237
+ event_type: str
238
+ section: Optional[str] = None
239
+
240
+ async def initialize(self, openai_api_key: str):
241
+ """Asynchronously validate and normalize the event"""
242
+ validator = CalendarEventValidator(openai_api_key)
243
+ result = await validator.validate_event(self)
244
+
245
+ if not result.is_valid:
246
+ raise ValueError(f"Invalid calendar event: {', '.join(result.errors)}")
247
+
248
+ # Update with normalized data
249
+ for field, value in result.normalized_data.items():
250
+ setattr(self, field, value)
251
+
252
+ # Log any warnings
253
+ if result.warnings:
254
+ logger.warning(f"Calendar event warnings: {', '.join(result.warnings)}")
255
 
256
  def to_searchable_text(self) -> str:
257
  """Convert event to searchable text format"""
 
259
  ภาคการศึกษา: {self.semester}
260
  ประเภท: {self.event_type}
261
  วันที่: {self.date}
262
+ เวลา: {self.time}
263
  กิจกรรม: {self.activity}
264
  หมวดหมู่: {self.section or '-'}
265
+ หมายเหตุ: {self.note}
266
  """.strip()
267
 
268
+ class CacheManager:
269
+ """Manages caching for different components of the RAG pipeline"""
270
+
271
+ def __init__(self, cache_dir: Path, ttl: int = 3600):
272
+ """
273
+ Initialize CacheManager
274
+
275
+ Args:
276
+ cache_dir: Directory to store cache files
277
+ ttl: Time-to-live in seconds for cache entries (default: 1 hour)
278
+ """
279
+ self.cache_dir = cache_dir
280
+ self.ttl = ttl
281
+ self.embeddings_cache = self._load_cache("embeddings")
282
+ self.query_cache = self._load_cache("queries")
283
+ self.document_cache = self._load_cache("documents")
284
+
285
+ def _generate_key(self, data: Union[str, Dict, Any]) -> str:
286
+ """Generate a unique cache key"""
287
+ if isinstance(data, str):
288
+ content = data.encode('utf-8')
289
+ else:
290
+ content = json.dumps(data, sort_keys=True).encode('utf-8')
291
+ return hashlib.md5(content).hexdigest()
292
+
293
+ def _load_cache(self, cache_type: str) -> Dict:
294
+ """Load cache from disk"""
295
+ cache_path = self.cache_dir / f"{cache_type}_cache.pkl"
296
+ if cache_path.exists():
297
+ try:
298
+ with open(cache_path, 'rb') as f:
299
+ cache = pickle.load(f)
300
+ # Clean expired entries
301
+ self._clean_expired_entries(cache)
302
+ return cache
303
+ except Exception as e:
304
+ logger.warning(f"Failed to load {cache_type} cache: {e}")
305
+ return {}
306
+ return {}
307
+
308
+ def _save_cache(self, cache_type: str, cache_data: Dict):
309
+ """Save cache to disk"""
310
+ cache_path = self.cache_dir / f"{cache_type}_cache.pkl"
311
+ try:
312
+ with open(cache_path, 'wb') as f:
313
+ pickle.dump(cache_data, f)
314
+ except Exception as e:
315
+ logger.error(f"Failed to save {cache_type} cache: {e}")
316
+
317
+ def _clean_expired_entries(self, cache: Dict):
318
+ """Remove expired cache entries"""
319
+ current_time = datetime.now()
320
+ expired_keys = [
321
+ key for key, (_, timestamp) in cache.items()
322
+ if current_time - timestamp > timedelta(seconds=self.ttl)
323
+ ]
324
+ for key in expired_keys:
325
+ del cache[key]
326
+
327
+ def get_embedding_cache(self, text: str) -> Optional[Any]:
328
+ """Get cached embedding for text"""
329
+ key = self._generate_key(text)
330
+ if key in self.embeddings_cache:
331
+ embedding, timestamp = self.embeddings_cache[key]
332
+ if datetime.now() - timestamp <= timedelta(seconds=self.ttl):
333
+ return embedding
334
+ return None
335
+
336
+ def set_embedding_cache(self, text: str, embedding: Any):
337
+ """Cache embedding for text"""
338
+ key = self._generate_key(text)
339
+ self.embeddings_cache[key] = (embedding, datetime.now())
340
+ self._save_cache("embeddings", self.embeddings_cache)
341
+
342
+ def get_query_cache(self, query: str) -> Optional[Dict]:
343
+ """Get cached query results"""
344
+ key = self._generate_key(query)
345
+ if key in self.query_cache:
346
+ result, timestamp = self.query_cache[key]
347
+ if datetime.now() - timestamp <= timedelta(seconds=self.ttl):
348
+ return result
349
+ return None
350
+
351
+ def set_query_cache(self, query: str, result: Dict):
352
+ """Cache query results"""
353
+ key = self._generate_key(query)
354
+ self.query_cache[key] = (result, datetime.now())
355
+ self._save_cache("queries", self.query_cache)
356
+
357
+ def get_document_cache(self, doc_id: str) -> Optional[Any]:
358
+ """Get cached document"""
359
+ if doc_id in self.document_cache:
360
+ doc, timestamp = self.document_cache[doc_id]
361
+ if datetime.now() - timestamp <= timedelta(seconds=self.ttl):
362
+ return doc
363
+ return None
364
+
365
+ def set_document_cache(self, doc_id: str, document: Any):
366
+ """Cache document"""
367
+ self.document_cache[doc_id] = (document, datetime.now())
368
+ self._save_cache("documents", self.document_cache)
369
+
370
+ def clear_cache(self, cache_type: Optional[str] = None):
371
+ """Clear specific or all caches"""
372
+ if cache_type == "embeddings":
373
+ self.embeddings_cache.clear()
374
+ self._save_cache("embeddings", self.embeddings_cache)
375
+ elif cache_type == "queries":
376
+ self.query_cache.clear()
377
+ self._save_cache("queries", self.query_cache)
378
+ elif cache_type == "documents":
379
+ self.document_cache.clear()
380
+ self._save_cache("documents", self.document_cache)
381
+ else:
382
+ self.embeddings_cache.clear()
383
+ self.query_cache.clear()
384
+ self.document_cache.clear()
385
+ for cache_type in ["embeddings", "queries", "documents"]:
386
+ self._save_cache(cache_type, {})
387
+
388
+ @dataclass
389
+ class ModelConfig:
390
+ """Configuration for language models and embeddings"""
391
+ openai_api_key: str
392
+ embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
393
+ openai_model: str = "gpt-4o"
394
+ temperature: float = 0.7
395
+ max_tokens: int = 2000
396
+ top_p: float = 0.95
397
+ frequency_penalty: float = 0.0
398
+ presence_penalty: float = 0.0
399
+
400
+ @dataclass
401
+ class RetrieverConfig:
402
+ """Configuration for document retrieval"""
403
+ top_k: int = 5
404
+ similarity_threshold: float = 0.7
405
+ reranking_enabled: bool = False
406
+ reranking_model: Optional[str] = None
407
+ filter_duplicates: bool = True
408
+ min_document_length: int = 10
409
+
410
+ @dataclass
411
+ class CacheConfig:
412
+ """Configuration for caching behavior"""
413
+ enabled: bool = True
414
+ cache_dir: Path = field(default_factory=lambda: Path("./cache"))
415
+ embeddings_cache_ttl: int = 86400 # 24 hours
416
+ query_cache_ttl: int = 3600 # 1 hour
417
+ max_cache_size: int = 1000 # entries
418
+ cache_cleanup_interval: int = 3600 # 1 hour
419
+
420
+ @dataclass
421
+ class ProcessingConfig:
422
+ """Configuration for data processing"""
423
+ batch_size: int = 32
424
+ max_retries: int = 3
425
+ timeout: int = 30
426
+ max_concurrent_requests: int = 5
427
+ chunk_size: int = 512
428
+ chunk_overlap: int = 50
429
+ preprocessing_workers: int = 4
430
+
431
+ @dataclass
432
+ class MonitoringConfig:
433
+ """Configuration for monitoring and logging"""
434
+ enable_monitoring: bool = True
435
+ log_level: str = "INFO"
436
+ metrics_enabled: bool = True
437
+ trace_enabled: bool = True
438
+ performance_logging: bool = True
439
+ slow_query_threshold: float = 5.0 # seconds
440
+ health_check_interval: int = 300 # 5 minutes
441
+
442
+ @dataclass
443
+ class LocalizationConfig:
444
+ """Configuration for Thai language handling"""
445
+ thai_tokenizer_model: str = "thai-tokenizer"
446
+ enable_thai_normalization: bool = True
447
+ remove_thai_tones: bool = False
448
+ keep_english: bool = True
449
+ custom_stopwords: List[str] = field(default_factory=list)
450
+ custom_synonyms: Dict[str, List[str]] = field(default_factory=dict)
451
+
452
+ @dataclass
453
+ class PipelineConfig:
454
+ """Main configuration for the RAG pipeline"""
455
+ # Model configurations
456
+ model: ModelConfig
457
+
458
+ # Retriever settings
459
+ retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
460
+
461
+ # Cache settings
462
+ cache: CacheConfig = field(default_factory=CacheConfig)
463
+
464
+ # Processing settings
465
+ processing: ProcessingConfig = field(default_factory=ProcessingConfig)
466
+
467
+ # Monitoring settings
468
+ monitoring: MonitoringConfig = field(default_factory=MonitoringConfig)
469
+
470
+ # Localization settings
471
+ localization: LocalizationConfig = field(default_factory=LocalizationConfig)
472
+
473
+ # Rate limiting
474
+ rate_limit_enabled: bool = True
475
+ requests_per_minute: int = 60
476
+
477
+ # System settings
478
+ debug_mode: bool = False
479
+ development_mode: bool = False
480
+
481
+ def __post_init__(self):
482
+ """Validate configuration and create necessary directories"""
483
+ if not self.model.openai_api_key:
484
+ raise ValueError("OpenAI API key is required")
485
+
486
+ if self.cache.enabled:
487
+ self.cache.cache_dir.mkdir(parents=True, exist_ok=True)
488
+
489
+ def to_dict(self) -> Dict[str, Any]:
490
+ """Convert configuration to dictionary format"""
491
+ return {
492
+ "model_config": {
493
+ "embedder_model": self.model.embedder_model,
494
+ "openai_model": self.model.openai_model,
495
+ "temperature": self.model.temperature,
496
+ # Add other relevant fields
497
+ },
498
+ "retriever_config": {
499
+ "top_k": self.retriever.top_k,
500
+ "similarity_threshold": self.retriever.similarity_threshold,
501
+ # Add other relevant fields
502
+ },
503
+ # Add other configuration sections
504
+ }
505
+
506
+ @classmethod
507
+ def from_dict(cls, config_dict: Dict[str, Any]) -> 'PipelineConfig':
508
+ """Create configuration from dictionary"""
509
+ model_config = ModelConfig(**config_dict.get("model_config", {}))
510
+ retriever_config = RetrieverConfig(**config_dict.get("retriever_config", {}))
511
+ # Create other config objects
512
+
513
+ return cls(
514
+ model=model_config,
515
+ retriever=retriever_config,
516
+ # Add other configuration objects
517
  )
518
 
519
+ def create_default_config(api_key: str) -> PipelineConfig:
520
+ """Create a default configuration with the given API key"""
521
+ model_config = ModelConfig(
522
+ openai_api_key=api_key,
523
+ embedder_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
524
+ )
525
+ return PipelineConfig(
526
+ model=model_config,
527
+ retriever=RetrieverConfig(),
528
+ cache=CacheConfig(),
529
+ processing=ProcessingConfig(),
530
+ monitoring=MonitoringConfig(),
531
+ localization=LocalizationConfig()
532
+ )
533
+
534
+ class CalendarDataProcessor:
535
+ """Process and structure calendar data"""
536
+
537
+ @staticmethod
538
+ def parse_calendar_json(json_data: List[Dict]) -> List[CalendarEvent]:
539
+ events = []
540
+
541
+ for semester_data in json_data:
542
+ semester = semester_data['education']
543
+
544
+ # Process regular schedule events
545
+ for event in semester_data.get('schedule', []):
546
+ # Check if this is a regular event or a section with details
547
+ if 'section' in event and 'details' in event:
548
+ # This is a section with details
549
+ section = event['section']
550
+ for detail in event['details']:
551
+ # Extract semester-specific information if available
552
+ if 'ภาคต้น' in detail and 'ภาคปลาย' in detail:
553
+ # Handle both semesters
554
+ semesters = ['ภาคต้น', 'ภาคปลาย']
555
+ for sem in semesters:
556
+ events.append(CalendarEvent(
557
+ date=detail.get(sem, ''),
558
+ time='',
559
+ activity=detail.get('title', ''),
560
+ note=section,
561
+ semester=sem,
562
+ event_type='deadline',
563
+ section=section
564
+ ))
565
+ else:
566
+ # Single event
567
+ events.append(CalendarEvent(
568
+ date=detail.get('date', ''),
569
+ time='',
570
+ activity=detail.get('title', ''),
571
+ note=section,
572
+ semester=semester,
573
+ event_type='deadline',
574
+ section=section
575
+ ))
576
+ else:
577
+ # This is a regular event
578
+ event_type = CalendarEvent.classify_event_type(event.get('activity', ''))
579
+ events.append(CalendarEvent(
580
+ date=event.get('date', ''),
581
+ time=event.get('time', ''),
582
+ activity=event.get('activity', ''),
583
+ note=event.get('note', ''),
584
+ semester=semester,
585
+ event_type=event_type
586
+ ))
587
+
588
+ return events
589
+
590
class EnhancedDocumentStore:
    """In-memory vector store over calendar events with embedding/query caching."""

    def __init__(self, config: PipelineConfig):
        self.store = InMemoryDocumentStore()
        self.embedder = SentenceTransformersDocumentEmbedder(
            model=config.model.embedder_model
        )
        self.cache_manager = CacheManager(
            cache_dir=config.cache.cache_dir,
            ttl=config.cache.embeddings_cache_ttl
        )

        # Load the embedding model eagerly so Thai text embeds without
        # lazy-initialization cost on the first query.
        self.embedder.warm_up()

        self.events = []            # every CalendarEvent, in insertion order
        self.event_type_index = {}  # event_type -> list of indices into self.events
        self.semester_index = {}    # semester   -> list of indices into self.events

    def _compute_embedding(self, text: str) -> Any:
        """Return the embedding vector for *text*, consulting the cache first."""
        hit = self.cache_manager.get_embedding_cache(text)
        if hit is not None:
            return hit

        embedded = self.embedder.run(documents=[Document(content=text)])
        vector = embedded["documents"][0].embedding
        self.cache_manager.set_embedding_cache(text, vector)
        return vector

    def add_events(self, events: List[CalendarEvent]):
        """Index *events* and write their embedded documents to the store."""
        docs = []

        for event in events:
            self.events.append(event)
            idx = len(self.events) - 1

            # Maintain secondary indices for event-type / semester lookups.
            self.event_type_index.setdefault(event.event_type, []).append(idx)
            self.semester_index.setdefault(event.semester, []).append(idx)

            searchable = event.to_searchable_text()
            doc = Document(
                content=searchable,
                embedding=self._compute_embedding(searchable),
                meta={
                    'event_type': event.event_type,
                    'semester': event.semester,
                    'date': event.date
                }
            )
            docs.append(doc)
            self.cache_manager.set_document_cache(str(idx), doc)

        self.store.write_documents(docs)

    def search(self,
               query: str,
               event_type: Optional[str] = None,
               semester: Optional[str] = None,
               top_k: int = 5) -> List[Document]:
        """Embedding search with optional metadata filters; results are cached."""
        cache_key = json.dumps({
            'query': query,
            'event_type': event_type,
            'semester': semester,
            'top_k': top_k
        })
        cached = self.cache_manager.get_query_cache(cache_key)
        if cached is not None:
            return cached

        # Over-fetch (2x) so that post-retrieval filtering can still fill top_k.
        retriever = InMemoryEmbeddingRetriever(
            document_store=self.store,
            top_k=top_k * 2
        )
        candidates = retriever.run(
            query_embedding=self._compute_embedding(query)
        )["documents"]

        # Keep only documents matching the requested type/semester (if given).
        matches = [
            doc for doc in candidates
            if (not event_type or doc.meta['event_type'] == event_type)
            and (not semester or doc.meta['semester'] == semester)
        ]
        trimmed = matches[:top_k]

        self.cache_manager.set_query_cache(cache_key, trimmed)
        return trimmed
704
+
705
class AdvancedQueryProcessor:
    """Analyze Thai academic-calendar queries into structured search hints.

    Uses an LLM to classify the query's event type, detect the semester, and
    extract key terms, returning a dict that drives retrieval and response
    formatting. Falls back to a permissive default analysis on any failure.
    """

    def __init__(self, config: PipelineConfig):
        self.generator = OpenAIGenerator(
            api_key=Secret.from_token(config.model.openai_api_key),
            model=config.model.openai_model
        )
        self.prompt_builder = PromptBuilder(
            template="""
            Analyze this academic calendar query (in Thai):
            Query: {{query}}

            Determine:
            1. The type of information being requested
            2. Any specific semester mentioned
            3. Key terms to look for

            Return as JSON:
            {
                "event_type": "registration|deadline|examination|academic|holiday",
                "semester": "term mentioned or null",
                "key_terms": ["up to 3 most important terms"],
                "response_format": "list|single|detailed"
            }
            """)

    def process_query(self, query: str) -> Dict[str, Any]:
        """Run the LLM analysis for *query*; never raises — falls back to defaults."""
        try:
            result = self.prompt_builder.run(query=query)
            response = self.generator.run(prompt=result["prompt"])

            # Guard against an empty/malformed generator response.
            if not response or not response.get("replies") or not response["replies"][0]:
                logger.warning("Received empty response from generator")
                return self._get_default_analysis(query)

            try:
                raw = response["replies"][0].strip()
                # LLMs frequently wrap JSON in markdown code fences
                # (```json ... ```); strip them so json.loads succeeds.
                cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw)
                analysis = json.loads(cleaned)

                # Validate required fields before trusting the analysis.
                # NOTE: loop variable intentionally not named `field` — that
                # would shadow dataclasses.field imported at module level.
                required_fields = ["event_type", "semester", "key_terms", "response_format"]
                for required in required_fields:
                    if required not in analysis:
                        logger.warning(f"Missing required field: {required}")
                        return self._get_default_analysis(query)

                return {
                    "original_query": query,
                    **analysis
                }

            except json.JSONDecodeError as je:
                logger.error(f"JSON parsing failed: {str(je)}")
                return self._get_default_analysis(query)

        except Exception as e:
            logger.error(f"Query processing failed: {str(e)}")
            return self._get_default_analysis(query)

    def _get_default_analysis(self, query: str) -> Dict[str, Any]:
        """Return a permissive default analysis when LLM processing fails."""
        logger.info("Returning default analysis")
        return {
            "original_query": query,
            "event_type": None,       # no type filter -> search everything
            "semester": None,         # no semester filter
            "key_terms": [],
            "response_format": "detailed"
        }
778
+
779
@dataclass
class RateLimitConfig:
    """Tunables for OpenAI API rate limiting, retries, and concurrency."""
    requests_per_minute: int = 60   # rolling-window request budget
    max_retries: int = 3            # attempts before giving up
    base_delay: float = 1.0         # initial backoff, in seconds
    max_delay: float = 60.0         # backoff ceiling, in seconds
    timeout: float = 30.0           # per-request timeout, in seconds
    concurrent_requests: int = 5    # maximum simultaneous in-flight requests
788
+
789
class APIError(Exception):
    """Base exception for OpenAI API failures.

    Carries the optional HTTP status code and raw response payload so that
    callers can inspect what the API actually returned.
    """

    def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[Dict] = None):
        self.status_code = status_code
        self.response = response
        super().__init__(message)
795
+
796
class RateLimitExceededError(APIError):
    """Raised when the OpenAI request budget for the current window is exhausted."""
799
+
800
class OpenAIRateLimiter:
    """Sliding-window rate limiter with retry/backoff for OpenAI API calls.

    Tracks request timestamps in a 60-second rolling window, caps concurrent
    in-flight requests with a semaphore, and records recent errors for
    monitoring via get_usage_stats().
    """

    def __init__(self, config: RateLimitConfig):
        self.config = config
        # time.time() stamps of requests inside the current 60s window.
        self.requests = deque(maxlen=config.requests_per_minute)
        # Caps the number of simultaneously in-flight requests.
        self.semaphore = asyncio.Semaphore(config.concurrent_requests)
        self.total_requests = 0
        self.errors = deque(maxlen=1000)  # ring buffer of recent error records
        self.start_time = datetime.now()

    async def acquire(self):
        """Block until a request slot is available in the 60-second window."""
        now = time.time()

        # Drop timestamps that have aged out of the rolling window.
        while self.requests and self.requests[0] < now - 60:
            self.requests.popleft()

        # If the window is full, wait until the oldest request ages out.
        if len(self.requests) >= self.config.requests_per_minute:
            wait_time = 60 - (now - self.requests[0])
            logger.warning(f"Rate limit reached. Waiting {wait_time:.2f} seconds")
            await asyncio.sleep(wait_time)
            # BUGFIX: refresh the clock after sleeping. Recording the stale
            # `now` would understate the window by the wait duration and let
            # subsequent requests through early.
            now = time.time()
            while self.requests and self.requests[0] < now - 60:
                self.requests.popleft()

        self.requests.append(now)
        self.total_requests += 1

    def get_usage_stats(self) -> Dict[str, Any]:
        """Return cumulative usage/error statistics for monitoring."""
        return {
            "total_requests": self.total_requests,
            "current_rpm": len(self.requests),
            "uptime": (datetime.now() - self.start_time).total_seconds(),
            # errors is capped at 1000, so this is approximate for long runs.
            "error_rate": len(self.errors) / self.total_requests if self.total_requests > 0 else 0
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        reraise=True
    )
    async def execute_with_retry(self, func, *args, **kwargs):
        """Run *func* under the semaphore and rate limit, retrying on failure.

        Retries up to 3 times with exponential backoff (tenacity). Every
        failure is appended to self.errors before being re-raised or
        translated into an APIError.
        """
        try:
            async with self.semaphore:
                await self.acquire()
                return await func(*args, **kwargs)

        except Exception as e:
            self.errors.append({
                "timestamp": datetime.now(),
                "error_type": type(e).__name__,
                "message": str(e)
            })

            if isinstance(e, RateLimitExceededError):
                logger.warning("Rate limit exceeded, backing off...")
                await asyncio.sleep(self.config.base_delay)
                raise

            elif "timeout" in str(e).lower():
                logger.error(f"Timeout error: {str(e)}")
                # Chain the original exception so the root cause is preserved.
                raise APIError(f"Request timed out after {self.config.timeout} seconds") from e

            else:
                logger.error(f"API error: {str(e)}")
                raise
870
+
871
class ResponseGenerator:
    """Turn retrieved calendar documents into a Thai-language answer via LLM."""

    def __init__(self, config: PipelineConfig):
        self.generator = OpenAIGenerator(
            api_key=Secret.from_token(config.model.openai_api_key),
            model=config.model.openai_model
        )
        self.prompt_builder = PromptBuilder(
            template="""
            You are a helpful academic advisor. Answer the following query using the provided calendar information.

            Query: {{query}}

            Relevant Calendar Information:
            {% for doc in context %}
            ---
            {{doc.content}}
            {% endfor %}

            Format: {{format}}

            Guidelines:
            1. Answer in Thai language
            2. Be specific about dates and requirements
            3. Include relevant notes or conditions
            4. Format the response according to the specified format

            Provide your response:
            """)

    def generate_response(self,
                          query: str,
                          documents: List[Document],
                          query_info: Dict[str, Any]) -> str:
        """Render the prompt from *documents* and ask the LLM for an answer.

        Returns a fixed Thai apology string if generation fails for any reason.
        """
        try:
            prompt = self.prompt_builder.run(
                query=query,
                context=documents,
                format=query_info["response_format"]
            )["prompt"]

            replies = self.generator.run(prompt=prompt)["replies"]
            return replies[0]

        except Exception as e:
            logger.error(f"Response generation failed: {str(e)}")
            return "ขออภัย ไม่สามารถประมวลผลคำตอบได้ในขณะนี้"
+
921
+ class AcademicCalendarRAG:
922
+ """Main RAG pipeline for academic calendar queries"""
923
+
924
    def __init__(self, config: PipelineConfig):
        # Compose the three pipeline stages around one shared config.
        self.config = config
        self.document_store = EnhancedDocumentStore(config)    # embeddings + caching
        self.query_processor = AdvancedQueryProcessor(config)  # LLM query analysis
        self.response_generator = ResponseGenerator(config)    # LLM answer generation
929
 
930
    def load_data(self, json_data: List[Dict]):
        """Parse raw calendar JSON into events and index them in the store."""
        processor = CalendarDataProcessor()
        events = processor.parse_calendar_json(json_data)
        self.document_store.add_events(events)
935
 
 
 
 
936
  def process_query(self, query: str) -> Dict[str, Any]:
937
+ """Process query and generate response"""
938
  try:
939
  # Analyze query
940
+ query_info = self.query_processor.process_query(query)
941
 
942
  # Retrieve relevant documents
943
+ documents = self.document_store.search(
944
+ query=query,
945
+ event_type=query_info["event_type"],
946
+ semester=query_info["semester"],
947
+ top_k=self.config.retriever.top_k
948
  )
949
 
950
+ # Generate response
951
+ response = self.response_generator.generate_response(
952
+ query=query,
953
+ documents=documents,
954
+ query_info=query_info
955
+ )
956
 
957
  return {
958
+ "answer": response,
959
  "documents": documents,
960
  "query_info": query_info
961
  }
 
968
  "query_info": {}
969
  }
970
 
971
def main():
    """Run the calendar RAG pipeline over a batch of sample queries.

    Reads the OpenAI API key from key.txt and the calendar data from
    calendar.json, both expected in the current working directory.

    Raises:
        Exception: re-raised after logging if any pipeline step fails.
    """
    try:
        # Load API key
        with open("key.txt", "r") as f:
            openai_api_key = f.read().strip()

        # Build the default pipeline config, then tune it for the Thai
        # academic-calendar use case.
        config = create_default_config(openai_api_key)
        config.localization.enable_thai_normalization = True
        config.retriever.top_k = 5      # documents fed to the LLM per query
        config.model.temperature = 0.3  # lower temperature -> more focused answers

        pipeline = AcademicCalendarRAG(config)

        # Load calendar data
        with open("calendar.json", "r", encoding="utf-8") as f:
            calendar_data = json.load(f)
        pipeline.load_data(calendar_data)

        # BUGFIX: the previous placeholder (`queries = input in web`) raised
        # NameError at runtime. Replace/extend this list, or wire queries in
        # from a web front end.
        queries = [
            "วันสุดท้ายของการชำระค่าลงทะเบียนภาคต้นคือวันไหน",
            "การสอบปลายภาคภาคปลายจัดขึ้นเมื่อไหร่",
        ]

        print("Processing calendar queries...")
        print("=" * 80)

        for query in queries:
            result = pipeline.process_query(query)
            print(f"\nQuery: {query}")
            print(f"Answer: {result['answer']}")
            print("=" * 80)

    except Exception as e:
        logger.error(f"Pipeline execution failed: {str(e)}")
        raise
1021
 
1022
# Script entry point: run the demo pipeline only when executed directly.
if __name__ == "__main__":
    main()