JirasakJo committed on
Commit ba89e8e · verified · 1 Parent(s): cf9045a

Update calendar_rag.py

Files changed (1)
  1. calendar_rag.py +917 -487
calendar_rag.py CHANGED
@@ -2,30 +2,123 @@ from haystack import *
  from haystack.components.generators.openai import OpenAIGenerator
  from haystack.components.builders import PromptBuilder
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder
- from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
  from haystack.document_stores.in_memory import InMemoryDocumentStore
  from haystack.utils import Secret
- from tenacity import retry, stop_after_attempt, wait_exponential
  from pathlib import Path
  import hashlib
  from datetime import *
  from typing import *
- import numpy as np
- from sklearn.metrics.pairwise import cosine_similarity
- from rouge_score import rouge_scorer
- import pandas as pd
  from dataclasses import *
  import json
  import logging
- import os
  import re
  import pickle

  # Setup logging
  logging.basicConfig(level=logging.INFO)
-
  logger = logging.getLogger(__name__)

  class OpenAIDateParser:
      """Uses OpenAI to parse complex Thai date formats"""

@@ -64,19 +157,14 @@ class OpenAIDateParser:
      async def parse_date(self, date_str: str) -> Dict[str, Union[str, bool]]:
          """Parse complex Thai date format using OpenAI"""
          try:
-             # Build prompt
              result = self.prompt_builder.run(date=date_str)
-
-             # Get OpenAI response
              response = await self.generator.arun(prompt=result["prompt"])

              if not response or not response.get("replies"):
                  raise ValueError("Empty response from OpenAI")

-             # Parse JSON response
              parsed = json.loads(response["replies"][0])

-             # Validate the parsed dates
              for date_field in ['start_date', 'end_date']:
                  if parsed.get(date_field):
                      datetime.strptime(parsed[date_field], '%Y-%m-%d')
@@ -87,25 +175,10 @@ class OpenAIDateParser:
              logger.error(f"OpenAI date parsing failed for '{date_str}': {str(e)}")
              raise ValueError(f"Could not parse date: {date_str}")

- @dataclass
- class ValidationResult:
-     """Stores the result of a validation check"""
-     is_valid: bool
-     errors: List[str]
-     warnings: List[str]
-     normalized_data: Dict[str, str]
-
  class ThaiTextPreprocessor:
      """Handles Thai text preprocessing and normalization"""

-     # Thai character normalization mappings
-     CHAR_MAP = {
-         'ํา': 'ำ',  # Normalize sara am
-         '์': '',    # Remove yamakkan
-         '–': '-',   # Normalize dashes
-         '—': '-',
-         '٫': ',',   # Normalize separators
-     }

      @classmethod
      def normalize_thai_text(cls, text: str) -> str:
@@ -113,14 +186,11 @@ class ThaiTextPreprocessor:
          if not text:
              return text

-         # Apply character mappings
          for old, new in cls.CHAR_MAP.items():
              text = text.replace(old, new)

-         # Normalize whitespace
          text = re.sub(r'\s+', ' ', text.strip())

-         # Normalize Thai numerals if present
          thai_digits = '๐๑๒๓๔๕๖๗๘๙'
          arabic_digits = '0123456789'

@@ -142,13 +212,11 @@ class CalendarEventValidator:
          warnings = []
          normalized_data = {}

-         # Validate and normalize date using OpenAI
          if event.date:
              try:
                  parsed_date = await self.date_parser.parse_date(event.date)
                  normalized_data['date'] = parsed_date['start_date']

-                 # If it's a date range, store it in the note
                  if parsed_date['is_range'] and parsed_date['end_date']:
                      range_note = f"ถึงวันที่ {parsed_date['end_date']}"
                      if event.note:
@@ -161,14 +229,12 @@ class CalendarEventValidator:
          else:
              errors.append("Date is required")

-         # Validate time format if provided
          if event.time:
              time_pattern = r'^([01]?[0-9]|2[0-3]):([0-5][0-9])$'
              if not re.match(time_pattern, event.time):
                  errors.append(f"Invalid time format: {event.time}")
              normalized_data['time'] = event.time

-         # Validate and normalize activity
          if event.activity:
              normalized_activity = self.preprocessor.normalize_thai_text(event.activity)
              if len(normalized_activity) < 3:
@@ -177,7 +243,6 @@ class CalendarEventValidator:
          else:
              errors.append("Activity is required")

-         # Validate semester
          valid_semesters = {'ภาคต้น', 'ภาคปลาย', 'ภาคฤดูร้อน'}
          if event.semester:
              normalized_semester = self.preprocessor.normalize_thai_text(event.semester)
@@ -187,17 +252,14 @@ class CalendarEventValidator:
          else:
              errors.append("Semester is required")

-         # Validate event type
          valid_types = {'registration', 'deadline', 'examination', 'academic', 'holiday'}
          if event.event_type not in valid_types:
              errors.append(f"Invalid event type: {event.event_type}")
          normalized_data['event_type'] = event.event_type

-         # Normalize note if present and not already set by date range
          if event.note and 'note' not in normalized_data:
              normalized_data['note'] = self.preprocessor.normalize_thai_text(event.note)

-         # Normalize section if present
          if event.section:
              normalized_data['section'] = self.preprocessor.normalize_thai_text(event.section)

@@ -208,11 +270,17 @@ class CalendarEventValidator:
              normalized_data=normalized_data
          )

- # Update CalendarEvent class to include async validation
  @dataclass
  class CalendarEvent:
      """Structured representation of a calendar event with validation"""
-
      @staticmethod
      def classify_event_type(activity: str) -> str:
          """Classify event type based on activity description"""
@@ -229,13 +297,6 @@ class CalendarEvent:
          if any(term in activity_lower for term in terms):
              return event_type
          return 'academic'
-     date: str
-     time: str
-     activity: str
-     note: str
-     semester: str
-     event_type: str
-     section: Optional[str] = None

      async def initialize(self, openai_api_key: str):
          """Asynchronously validate and normalize the event"""
@@ -245,11 +306,9 @@ class CalendarEvent:
          if not result.is_valid:
              raise ValueError(f"Invalid calendar event: {', '.join(result.errors)}")

-         # Update with normalized data
          for field, value in result.normalized_data.items():
              setattr(self, field, value)
-
-         # Log any warnings
          if result.warnings:
              logger.warning(f"Calendar event warnings: {', '.join(result.warnings)}")

@@ -259,22 +318,18 @@ class CalendarEvent:
          ภาคการศึกษา: {self.semester}
          ประเภท: {self.event_type}
          วันที่: {self.date}
-         เวลา: {self.time}
          กิจกรรม: {self.activity}
          หมวดหมู่: {self.section or '-'}
-         หมายเหตุ: {self.note}
          """.strip()
-
  class CacheManager:
      """Manages caching for different components of the RAG pipeline"""

      def __init__(self, cache_dir: Path, ttl: int = 3600):
          """
          Initialize CacheManager
-
-         Args:
-             cache_dir: Directory to store cache files
-             ttl: Time-to-live in seconds for cache entries (default: 1 hour)
          """
          self.cache_dir = cache_dir
          self.ttl = ttl
@@ -297,7 +352,6 @@ class CacheManager:
          try:
              with open(cache_path, 'rb') as f:
                  cache = pickle.load(f)
-             # Clean expired entries
              self._clean_expired_entries(cache)
              return cache
          except Exception as e:
@@ -354,259 +408,435 @@ class CacheManager:
          self.query_cache[key] = (result, datetime.now())
          self._save_cache("queries", self.query_cache)

-     def get_document_cache(self, doc_id: str) -> Optional[Any]:
-         """Get cached document"""
-         if doc_id in self.document_cache:
-             doc, timestamp = self.document_cache[doc_id]
-             if datetime.now() - timestamp <= timedelta(seconds=self.ttl):
-                 return doc
-         return None
-
      def set_document_cache(self, doc_id: str, document: Any):
          """Cache document"""
          self.document_cache[doc_id] = (document, datetime.now())
          self._save_cache("documents", self.document_cache)
-
-     def clear_cache(self, cache_type: Optional[str] = None):
-         """Clear specific or all caches"""
-         if cache_type == "embeddings":
-             self.embeddings_cache.clear()
-             self._save_cache("embeddings", self.embeddings_cache)
-         elif cache_type == "queries":
-             self.query_cache.clear()
-             self._save_cache("queries", self.query_cache)
-         elif cache_type == "documents":
-             self.document_cache.clear()
-             self._save_cache("documents", self.document_cache)
-         else:
-             self.embeddings_cache.clear()
-             self.query_cache.clear()
-             self.document_cache.clear()
-             for cache_type in ["embeddings", "queries", "documents"]:
-                 self._save_cache(cache_type, {})

  @dataclass
  class ModelConfig:
-     """Configuration for language models and embeddings"""
      openai_api_key: str
      embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
      openai_model: str = "gpt-4o"
      temperature: float = 0.7
-     max_tokens: int = 2000
-     top_p: float = 0.95
-     frequency_penalty: float = 0.0
-     presence_penalty: float = 0.0

  @dataclass
  class RetrieverConfig:
-     """Configuration for document retrieval"""
      top_k: int = 5
-     similarity_threshold: float = 0.7
-     reranking_enabled: bool = False
-     reranking_model: Optional[str] = None
-     filter_duplicates: bool = True
-     min_document_length: int = 10

  @dataclass
  class CacheConfig:
-     """Configuration for caching behavior"""
      enabled: bool = True
-     cache_dir: Path = field(default_factory=lambda: Path("./cache"))
-     embeddings_cache_ttl: int = 86400  # 24 hours
-     query_cache_ttl: int = 3600  # 1 hour
-     max_cache_size: int = 1000  # entries
-     cache_cleanup_interval: int = 3600  # 1 hour

  @dataclass
  class ProcessingConfig:
-     """Configuration for data processing"""
      batch_size: int = 32
-     max_retries: int = 3
-     timeout: int = 30
-     max_concurrent_requests: int = 5
-     chunk_size: int = 512
-     chunk_overlap: int = 50
-     preprocessing_workers: int = 4
-
- @dataclass
- class MonitoringConfig:
-     """Configuration for monitoring and logging"""
-     enable_monitoring: bool = True
-     log_level: str = "INFO"
-     metrics_enabled: bool = True
-     trace_enabled: bool = True
-     performance_logging: bool = True
-     slow_query_threshold: float = 5.0  # seconds
-     health_check_interval: int = 300  # 5 minutes

  @dataclass
  class LocalizationConfig:
-     """Configuration for Thai language handling"""
-     thai_tokenizer_model: str = "thai-tokenizer"
      enable_thai_normalization: bool = True
-     remove_thai_tones: bool = False
-     keep_english: bool = True
-     custom_stopwords: List[str] = field(default_factory=list)
-     custom_synonyms: Dict[str, List[str]] = field(default_factory=dict)
-
  @dataclass
  class PipelineConfig:
-     """Main configuration for the RAG pipeline"""
-     # Model configurations
      model: ModelConfig
-
-     # Retriever settings
      retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
-
-     # Cache settings
      cache: CacheConfig = field(default_factory=CacheConfig)
-
-     # Processing settings
      processing: ProcessingConfig = field(default_factory=ProcessingConfig)
-
-     # Monitoring settings
-     monitoring: MonitoringConfig = field(default_factory=MonitoringConfig)
-
-     # Localization settings
      localization: LocalizationConfig = field(default_factory=LocalizationConfig)
-
-     # Rate limiting
-     rate_limit_enabled: bool = True
-     requests_per_minute: int = 60
-
-     # System settings
-     debug_mode: bool = False
-     development_mode: bool = False
-
-     def __post_init__(self):
-         """Validate configuration and create necessary directories"""
-         if not self.model.openai_api_key:
-             raise ValueError("OpenAI API key is required")
-
-         if self.cache.enabled:
-             self.cache.cache_dir.mkdir(parents=True, exist_ok=True)
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Convert configuration to dictionary format"""
-         return {
-             "model_config": {
-                 "embedder_model": self.model.embedder_model,
-                 "openai_model": self.model.openai_model,
-                 "temperature": self.model.temperature,
-                 # Add other relevant fields
-             },
-             "retriever_config": {
-                 "top_k": self.retriever.top_k,
-                 "similarity_threshold": self.retriever.similarity_threshold,
-                 # Add other relevant fields
-             },
-             # Add other configuration sections
-         }
-
-     @classmethod
-     def from_dict(cls, config_dict: Dict[str, Any]) -> 'PipelineConfig':
-         """Create configuration from dictionary"""
-         model_config = ModelConfig(**config_dict.get("model_config", {}))
-         retriever_config = RetrieverConfig(**config_dict.get("retriever_config", {}))
-         # Create other config objects
-
-         return cls(
-             model=model_config,
-             retriever=retriever_config,
-             # Add other configuration objects
-         )

  def create_default_config(api_key: str) -> PipelineConfig:
-     """Create a default configuration with the given API key"""
-     model_config = ModelConfig(
-         openai_api_key=api_key,
-         embedder_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-     )
-     return PipelineConfig(
-         model=model_config,
-         retriever=RetrieverConfig(),
-         cache=CacheConfig(),
-         processing=ProcessingConfig(),
-         monitoring=MonitoringConfig(),
-         localization=LocalizationConfig()
-     )

  class CalendarDataProcessor:
-     """Process and structure calendar data"""

      @staticmethod
-     def parse_calendar_json(json_data: List[Dict]) -> List[CalendarEvent]:
          events = []

-         for semester_data in json_data:
-             semester = semester_data['education']

-             # Process regular schedule events
-             for event in semester_data.get('schedule', []):
-                 # Check if this is a regular event or a section with details
                  if 'section' in event and 'details' in event:
-                     # This is a section with details
                      section = event['section']
                      for detail in event['details']:
-                         # Extract semester-specific information if available
                          if 'ภาคต้น' in detail and 'ภาคปลาย' in detail:
-                             # Handle both semesters
-                             semesters = ['ภาคต้น', 'ภาคปลาย']
-                             for sem in semesters:
-                                 events.append(CalendarEvent(
-                                     date=detail.get(sem, ''),
-                                     time='',
-                                     activity=detail.get('title', ''),
-                                     note=section,
-                                     semester=sem,
-                                     event_type='deadline',
-                                     section=section
-                                 ))
                          else:
-                             # Single event
                              events.append(CalendarEvent(
                                  date=detail.get('date', ''),
                                  time='',
                                  activity=detail.get('title', ''),
                                  note=section,
-                                 semester=semester,
                                  event_type='deadline',
                                  section=section
                              ))
                  else:
-                     # This is a regular event
                      event_type = CalendarEvent.classify_event_type(event.get('activity', ''))
                      events.append(CalendarEvent(
                          date=event.get('date', ''),
                          time=event.get('time', ''),
                          activity=event.get('activity', ''),
                          note=event.get('note', ''),
-                         semester=semester,
                          event_type=event_type
                      ))

          return events

- # Update the EnhancedDocumentStore class to use caching
- class EnhancedDocumentStore:
-     """Enhanced document store with caching capabilities"""

      def __init__(self, config: PipelineConfig):
          self.store = InMemoryDocumentStore()
          self.embedder = SentenceTransformersDocumentEmbedder(
              model=config.model.embedder_model
          )
          self.cache_manager = CacheManager(
              cache_dir=config.cache.cache_dir,
-             ttl=config.cache.embeddings_cache_ttl
          )

-         # Configure for Thai text
          self.embedder.warm_up()

          self.events = []
          self.event_type_index = {}
          self.semester_index = {}

      def _compute_embedding(self, text: str) -> Any:
          """Compute embedding with caching"""
@@ -618,89 +848,269 @@ class EnhancedDocumentStore:
              embedding = self.embedder.run(documents=[doc])["documents"][0].embedding
              self.cache_manager.set_embedding_cache(text, embedding)
              return embedding
-
-     def add_events(self, events: List[CalendarEvent]):
-         """Add events with caching"""
-         documents = []
-
-         for event in events:
-             # Store event
-             self.events.append(event)
-             event_idx = len(self.events) - 1
-
-             # Update indices
-             if event.event_type not in self.event_type_index:
-                 self.event_type_index[event.event_type] = []
-             self.event_type_index[event.event_type].append(event_idx)
-
-             if event.semester not in self.semester_index:
-                 self.semester_index[event.semester] = []
-             self.semester_index[event.semester].append(event_idx)
-
-             # Create document with cached embedding
-             text = event.to_searchable_text()
              embedding = self._compute_embedding(text)

              doc = Document(
                  content=text,
                  embedding=embedding,
-                 meta={
-                     'event_type': event.event_type,
-                     'semester': event.semester,
-                     'date': event.date
-                 }
              )
-             documents.append(doc)

              # Cache document
-             self.cache_manager.set_document_cache(str(event_idx), doc)
-
-         # Store documents
-         self.store.write_documents(documents)
-
-     def search(self,
-                query: str,
-                event_type: Optional[str] = None,
-                semester: Optional[str] = None,
-                top_k: int = 5) -> List[Document]:
-         """Search with query caching"""
-         # Check cache first
          cache_key = json.dumps({
              'query': query,
              'event_type': event_type,
              'semester': semester,
-             'top_k': top_k
          })
          cached_results = self.cache_manager.get_query_cache(cache_key)
          if cached_results is not None:
              return cached_results
-
-         # Compute query embedding
          query_embedding = self._compute_embedding(query)

-         # Perform search
-         retriever = InMemoryEmbeddingRetriever(
-             document_store=self.store,
-             top_k=top_k * 2
-         )

-         results = retriever.run(query_embedding=query_embedding)["documents"]

-         # Filter results
          filtered_results = []
-         for doc in results:
-             if event_type and doc.meta['event_type'] != event_type:
                  continue
-             if semester and doc.meta['semester'] != semester:
                  continue
              filtered_results.append(doc)

          final_results = filtered_results[:top_k]
-
-         # Cache results
          self.cache_manager.set_query_cache(cache_key, final_results)

          return final_results

  class AdvancedQueryProcessor:
      """Process queries with better understanding"""
@@ -712,58 +1122,23 @@ class AdvancedQueryProcessor:
          )
          self.prompt_builder = PromptBuilder(
              template="""
-             Analyze this academic calendar query (in Thai):
-             Query: {{query}}

-             Determine:
-             1. The type of information being requested
-             2. Any specific semester mentioned
-             3. Key terms to look for

-             Return as JSON:
              {
-                 "event_type": "registration|deadline|examination|academic|holiday",
-                 "semester": "term mentioned or null",
-                 "key_terms": ["up to 3 most important terms"],
-                 "response_format": "list|single|detailed"
              }
-             """)
-
-     def process_query(self, query: str) -> Dict[str, Any]:
-         """Process and analyze query"""
-         try:
-             # Get analysis
-             result = self.prompt_builder.run(query=query)
-             response = self.generator.run(prompt=result["prompt"])
-
-             # Add validation for empty response
-             if not response or not response.get("replies") or not response["replies"][0]:
-                 logger.warning("Received empty response from generator")
-                 return self._get_default_analysis(query)
-
-             try:
-                 # Parse response with error handling
-                 analysis = json.loads(response["replies"][0])
-
-                 # Validate required fields
-                 required_fields = ["event_type", "semester", "key_terms", "response_format"]
-                 for field in required_fields:
-                     if field not in analysis:
-                         logger.warning(f"Missing required field: {field}")
-                         return self._get_default_analysis(query)
-
-                 return {
-                     "original_query": query,
-                     **analysis
-                 }
-
-             except json.JSONDecodeError as je:
-                 logger.error(f"JSON parsing failed: {str(je)}")
-                 return self._get_default_analysis(query)
-
-         except Exception as e:
-             logger.error(f"Query processing failed: {str(e)}")
-             return self._get_default_analysis(query)

      def _get_default_analysis(self, query: str) -> Dict[str, Any]:
          """Return default analysis when processing fails"""
@@ -775,98 +1150,41 @@ class AdvancedQueryProcessor:
              "key_terms": [],
              "response_format": "detailed"
          }
-
- @dataclass
- class RateLimitConfig:
-     """Configuration for rate limiting"""
-     requests_per_minute: int = 60
-     max_retries: int = 3
-     base_delay: float = 1.0
-     max_delay: float = 60.0
-     timeout: float = 30.0
-     concurrent_requests: int = 5
-
- class APIError(Exception):
-     """Base class for API related errors"""
-     def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[Dict] = None):
-         super().__init__(message)
-         self.status_code = status_code
-         self.response = response
-
- class RateLimitExceededError(APIError):
-     """Raised when rate limit is exceeded"""
-     pass
-
- class OpenAIRateLimiter:
-     """Rate limiter with advanced error handling for OpenAI API"""
-
-     def __init__(self, config: RateLimitConfig):
-         self.config = config
-         self.requests = deque(maxlen=config.requests_per_minute)
-         self.semaphore = asyncio.Semaphore(config.concurrent_requests)
-         self.total_requests = 0
-         self.errors = deque(maxlen=1000)  # Store recent errors
-         self.start_time = datetime.now()
-
-     async def acquire(self):
-         """Acquire permission to make a request"""
-         now = time.time()
-
-         # Clean old requests
-         while self.requests and self.requests[0] < now - 60:
-             self.requests.popleft()
-
-         # Check if we're at the limit
-         if len(self.requests) >= self.config.requests_per_minute:
-             wait_time = 60 - (now - self.requests[0])
-             logger.warning(f"Rate limit reached. Waiting {wait_time:.2f} seconds")
-             await asyncio.sleep(wait_time)
-
-         # Add new request timestamp
-         self.requests.append(now)
-         self.total_requests += 1
-
-     def get_usage_stats(self) -> Dict[str, Any]:
-         """Get current usage statistics"""
-         return {
-             "total_requests": self.total_requests,
-             "current_rpm": len(self.requests),
-             "uptime": (datetime.now() - self.start_time).total_seconds(),
-             "error_rate": len(self.errors) / self.total_requests if self.total_requests > 0 else 0
-         }
-
-     @retry(
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(multiplier=1, min=4, max=60),
-         reraise=True
-     )
-     async def execute_with_retry(self, func, *args, **kwargs):
-         """Execute API call with retry logic"""
          try:
-             async with self.semaphore:
-                 await self.acquire()
-                 return await func(*args, **kwargs)
-
-         except Exception as e:
-             error_info = {
-                 "timestamp": datetime.now(),
-                 "error_type": type(e).__name__,
-                 "message": str(e)
              }
-             self.errors.append(error_info)
-
-             if isinstance(e, RateLimitExceededError):
-                 logger.warning("Rate limit exceeded, backing off...")
-                 await asyncio.sleep(self.config.base_delay)
-                 raise
-
-             elif "timeout" in str(e).lower():
-                 logger.error(f"Timeout error: {str(e)}")
-                 raise APIError(f"Request timed out after {self.config.timeout} seconds")
-
-             else:
-                 logger.error(f"API error: {str(e)}")
-                 raise

  class ResponseGenerator:
      """Generate responses with better context utilization"""
@@ -878,27 +1196,24 @@ class ResponseGenerator:
          )
          self.prompt_builder = PromptBuilder(
              template="""
-             You are a helpful academic advisor. Answer the following query using the provided calendar information.
-
-             Query: {{query}}
-
-             Relevant Calendar Information:
              {% for doc in context %}
-             ---
              {{doc.content}}
              {% endfor %}

-             Format: {{format}}
-
-             Guidelines:
-             1. Answer in Thai language
-             2. Be specific about dates and requirements
-             3. Include relevant notes or conditions
-             4. Format the response according to the specified format
-
-             Provide your response:
-             """)
-
      def generate_response(self,
                            query: str,
                            documents: List[Document],
@@ -919,34 +1234,164 @@ class ResponseGenerator:
              return "ขออภัย ไม่สามารถประมวลผลคำตอบได้ในขณะนี้"

  class AcademicCalendarRAG:
-     """Main RAG pipeline for academic calendar queries"""

      def __init__(self, config: PipelineConfig):
          self.config = config
-         self.document_store = EnhancedDocumentStore(config)
          self.query_processor = AdvancedQueryProcessor(config)
          self.response_generator = ResponseGenerator(config)

-     def load_data(self, json_data: List[Dict]):
-         """Load and process calendar data"""
-         processor = CalendarDataProcessor()
-         events = processor.parse_calendar_json(json_data)
-         self.document_store.add_events(events)

-     def process_query(self, query: str) -> Dict[str, Any]:
-         """Process query and generate response"""
          try:
              # Analyze query
              query_info = self.query_processor.process_query(query)

-             # Retrieve relevant documents
-             documents = self.document_store.search(
                  query=query,
-                 event_type=query_info["event_type"],
-                 semester=query_info["semester"],
-                 top_k=self.config.retriever.top_k
              )

              # Generate response
              response = self.response_generator.generate_response(
                  query=query,
@@ -955,65 +1400,50 @@ class AcademicCalendarRAG:
              )

              return {
                  "answer": response,
-                 "documents": documents,
                  "query_info": query_info
              }

          except Exception as e:
-             logger.error(f"Query processing failed: {str(e)}")
              return {
-                 "answer": "ขออภัย ไม่สามารถประมวลผลคำถามได้ในขณะนี้",
-                 "documents": [],
-                 "query_info": {}
              }
-
  # def main():
-     # """Main function for processing real calendar queries"""
  #     try:
  #         # Load API key
  #         with open("key.txt", "r") as f:
  #             openai_api_key = f.read().strip()

-         # # Use create_default_config instead of direct PipelineConfig initialization
  #         config = create_default_config(openai_api_key)
-
-         # # Customize config for Thai academic calendar use case
  #         config.localization.enable_thai_normalization = True
-         # config.retriever.top_k = 5  # Adjust based on your needs
-         # config.model.temperature = 0.3  # Lower temperature for more focused responses

-         # # Initialize pipeline with enhanced config
  #         pipeline = AcademicCalendarRAG(config)

-         # # Load calendar data
-         # with open("calendar.json", "r", encoding="utf-8") as f:
-         #     calendar_data = json.load(f)
-         # pipeline.load_data(calendar_data)

-         # # Real queries to process
-         # queries = ["นิสิตที่เข้าศึกษาในภาคเรียนที่ 1 ปีการศึกษา 2567 สามารถถอนรายวิชาได้หรือไม่? เพราะเหตุใด?"]

-         # print("Processing calendar queries...")
  #         print("=" * 80)

  #         for query in queries:
-             # result = pipeline.process_query(query)
  #             print(f"\nQuery: {query}")
  #             print(f"Answer: {result['answer']}")
-
-         # # # Print retrieved documents for verification
-         # # print("\nRetrieved Documents:")
-         # # for i, doc in enumerate(result['documents'], 1):
-         # #     print(f"\nDocument {i}:")
-         # #     print(doc.content)
-
-         # # # Print query understanding info
-         # # print("\nQuery Understanding:")
-         # # for key, value in result['query_info'].items():
-         # #     print(f"{key}: {value}")
-
-         # print("=" * 80)

  #     except Exception as e:
  #         logger.error(f"Pipeline execution failed: {str(e)}")
 
  from haystack.components.generators.openai import OpenAIGenerator
  from haystack.components.builders import PromptBuilder
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder
+ from haystack.components.retrievers.in_memory import *
  from haystack.document_stores.in_memory import InMemoryDocumentStore
  from haystack.utils import Secret
  from pathlib import Path
  import hashlib
  from datetime import *
  from typing import *
  from dataclasses import *
  import json
  import logging
  import re
  import pickle

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ @dataclass
+ class ValidationResult:
+     """Stores the result of a validation check"""
+     is_valid: bool
+     errors: List[str]
+     warnings: List[str]
+     normalized_data: Dict[str, str]
+
+ @dataclass
+ class ApplicationInfo:
+     application_portal: str
+     program_email: str
+
+ @dataclass
+ class RequiredDocument:
+     name: str
+     description: str
+     conditions: Optional[str] = None
+
+ @dataclass
+ class SelectionStep:
+     step_number: str
+     description: str
+
+ @dataclass
+ class ProgramDetailInfo:
+     application_info: ApplicationInfo
+     required_documents: Dict[str, Dict[str, RequiredDocument]]
+     submission_process: str
+     selection_process: List[SelectionStep]
+
+ @dataclass
+ class Transportation:
+     boat: str
+     bts: str
+     mrt: str
+     airport_link: str
+     bus: Dict[str, str]
+
+ @dataclass
+ class Contact:
+     email: str
+     facebook: Dict[str, str]
+
+ @dataclass
+ class ContactDetail:
+     event_type: str
+     department: str
+     faculty: str
+     university: str
+     location: str
+     contact: Contact
+     transportation: Transportation
+
+ @dataclass
+ class Course:
+     code: str
+     title_th: str
+     title_en: str
+     credits: int
+
+ @dataclass
+ class CourseCategory:
+     description: Optional[str]
+     credits: Union[str, int]
+     minimum_credits: Optional[int]
+     courses: List[Course]
+
+ @dataclass
+ class CourseStructure:
+     event_type: str
+     program_name: str
+     department: str
+     total_credits: int
+     degree_level: str
+     structure: Dict[str, CourseCategory]
+
+ @dataclass
+ class StudyPlan:
+     event_type: str
+     years: Dict[str, Dict[str, Any]]
+
+ @dataclass
+ class RegularFee:
+     amount: float
+     currency: str
+     period: str
+
+ @dataclass
+ class LatePaymentFee:
+     amount: float
+     currency: str
+
+ @dataclass
+ class TuitionFee:
+     event_type: str
+     regular_fee: RegularFee
+     late_payment_fee: LatePaymentFee
+
+
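For reference, the new dataclasses nest into one record per data domain (ProgramDetailInfo wraps ApplicationInfo, RequiredDocument, and SelectionStep; TuitionFee wraps the two fee types). A small illustrative instance, with values invented rather than taken from the commit:

# Hypothetical example values; only the structure comes from the dataclasses above.
fee = TuitionFee(
    event_type="tuition_fee",
    regular_fee=RegularFee(amount=25000.0, currency="THB", period="per semester"),
    late_payment_fee=LatePaymentFee(amount=500.0, currency="THB"),
)
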
  class OpenAIDateParser:
      """Uses OpenAI to parse complex Thai date formats"""

  ...
      async def parse_date(self, date_str: str) -> Dict[str, Union[str, bool]]:
          """Parse complex Thai date format using OpenAI"""
          try:
              result = self.prompt_builder.run(date=date_str)
              response = await self.generator.arun(prompt=result["prompt"])

              if not response or not response.get("replies"):
                  raise ValueError("Empty response from OpenAI")

              parsed = json.loads(response["replies"][0])

              for date_field in ['start_date', 'end_date']:
                  if parsed.get(date_field):
                      datetime.strptime(parsed[date_field], '%Y-%m-%d')
  ...
              logger.error(f"OpenAI date parsing failed for '{date_str}': {str(e)}")
              raise ValueError(f"Could not parse date: {date_str}")

  class ThaiTextPreprocessor:
      """Handles Thai text preprocessing and normalization"""

+     CHAR_MAP = {'ํา': 'ำ', '์': '', '–': '-', '—': '-', '٫': ',',}

      @classmethod
      def normalize_thai_text(cls, text: str) -> str:
  ...
          if not text:
              return text

          for old, new in cls.CHAR_MAP.items():
              text = text.replace(old, new)

          text = re.sub(r'\s+', ' ', text.strip())

          thai_digits = '๐๑๒๓๔๕๖๗๘๙'
          arabic_digits = '0123456789'

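An illustrative call for the normalization above; note that the actual Thai-to-Arabic digit translation step sits in unchanged lines elided from this diff, so the digit part of the output shown is an assumption:

# Illustrative input; CHAR_MAP maps the en dash, re.sub collapses whitespace,
# and (assuming the elided lines apply the thai_digits -> arabic_digits mapping)
# Thai numerals come out as 0-9.
ThaiTextPreprocessor.normalize_thai_text("สอบ   ๑๕ – ๑๖")
# -> "สอบ 15 - 16"
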
  ...
          warnings = []
          normalized_data = {}

          if event.date:
              try:
                  parsed_date = await self.date_parser.parse_date(event.date)
                  normalized_data['date'] = parsed_date['start_date']

                  if parsed_date['is_range'] and parsed_date['end_date']:
                      range_note = f"ถึงวันที่ {parsed_date['end_date']}"
                      if event.note:
  ...
          else:
              errors.append("Date is required")

          if event.time:
              time_pattern = r'^([01]?[0-9]|2[0-3]):([0-5][0-9])$'
              if not re.match(time_pattern, event.time):
                  errors.append(f"Invalid time format: {event.time}")
              normalized_data['time'] = event.time

          if event.activity:
              normalized_activity = self.preprocessor.normalize_thai_text(event.activity)
              if len(normalized_activity) < 3:
  ...
          else:
              errors.append("Activity is required")

          valid_semesters = {'ภาคต้น', 'ภาคปลาย', 'ภาคฤดูร้อน'}
          if event.semester:
              normalized_semester = self.preprocessor.normalize_thai_text(event.semester)
  ...
          else:
              errors.append("Semester is required")

          valid_types = {'registration', 'deadline', 'examination', 'academic', 'holiday'}
          if event.event_type not in valid_types:
              errors.append(f"Invalid event type: {event.event_type}")
          normalized_data['event_type'] = event.event_type

          if event.note and 'note' not in normalized_data:
              normalized_data['note'] = self.preprocessor.normalize_thai_text(event.note)

          if event.section:
              normalized_data['section'] = self.preprocessor.normalize_thai_text(event.section)

  ...
              normalized_data=normalized_data
          )

  @dataclass
  class CalendarEvent:
      """Structured representation of a calendar event with validation"""
+     date: str
+     time: str
+     activity: str
+     note: str
+     semester: str
+     event_type: str
+     section: Optional[str] = None
+
      @staticmethod
      def classify_event_type(activity: str) -> str:
          """Classify event type based on activity description"""
  ...
          if any(term in activity_lower for term in terms):
              return event_type
          return 'academic'

      async def initialize(self, openai_api_key: str):
          """Asynchronously validate and normalize the event"""
  ...
          if not result.is_valid:
              raise ValueError(f"Invalid calendar event: {', '.join(result.errors)}")

          for field, value in result.normalized_data.items():
              setattr(self, field, value)
+
          if result.warnings:
              logger.warning(f"Calendar event warnings: {', '.join(result.warnings)}")

  ...
          ภาคการศึกษา: {self.semester}
          ประเภท: {self.event_type}
          วันที่: {self.date}
+         เวลา: {self.time or '-'}
          กิจกรรม: {self.activity}
          หมวดหมู่: {self.section or '-'}
+         หมายเหตุ: {self.note or '-'}
          """.strip()
+
  class CacheManager:
      """Manages caching for different components of the RAG pipeline"""

      def __init__(self, cache_dir: Path, ttl: int = 3600):
          """
          Initialize CacheManager
          """
          self.cache_dir = cache_dir
          self.ttl = ttl
  ...
          try:
              with open(cache_path, 'rb') as f:
                  cache = pickle.load(f)
              self._clean_expired_entries(cache)
              return cache
          except Exception as e:
  ...
          self.query_cache[key] = (result, datetime.now())
          self._save_cache("queries", self.query_cache)

      def set_document_cache(self, doc_id: str, document: Any):
          """Cache document"""
          self.document_cache[doc_id] = (document, datetime.now())
          self._save_cache("documents", self.document_cache)

  @dataclass
  class ModelConfig:
      openai_api_key: str
      embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
      openai_model: str = "gpt-4o"
      temperature: float = 0.7

  @dataclass
  class RetrieverConfig:
      top_k: int = 5

  @dataclass
  class CacheConfig:
      enabled: bool = True
+     cache_dir: Path = Path("./cache")
+     ttl: int = 86400  # 24 hours

  @dataclass
  class ProcessingConfig:
      batch_size: int = 32

  @dataclass
  class LocalizationConfig:
      enable_thai_normalization: bool = True
+
  @dataclass
  class PipelineConfig:
      model: ModelConfig
      retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
      cache: CacheConfig = field(default_factory=CacheConfig)
      processing: ProcessingConfig = field(default_factory=ProcessingConfig)
      localization: LocalizationConfig = field(default_factory=LocalizationConfig)

  def create_default_config(api_key: str) -> PipelineConfig:
+     return PipelineConfig(model=ModelConfig(openai_api_key=api_key))

  class CalendarDataProcessor:
+     """Process and structure calendar data from the new raw-data.json format"""

      @staticmethod
+     def parse_calendar_json(json_data: Dict) -> List[CalendarEvent]:
+         """Parse the new calendar JSON format into CalendarEvent objects"""
          events = []

+         # Extract academic calendar data - handle direct dictionary input
+         calendar_data = json_data.get('academic_calendar', []) if isinstance(json_data, dict) else json_data
+
+         for semester_block in calendar_data:
+             semester = semester_block.get('education', '')
+             schedule = semester_block.get('schedule', [])

+             # Handle regular schedule events
+             for event in schedule:
                  if 'section' in event and 'details' in event:
+                     # Process section-based events (thesis deadlines, etc.)
                      section = event['section']
                      for detail in event['details']:
                          if 'ภาคต้น' in detail and 'ภาคปลาย' in detail:
+                             # Handle dual-semester events
+                             for sem_key in ['ภาคต้น', 'ภาคปลาย']:
+                                 if detail.get(sem_key):
+                                     events.append(CalendarEvent(
+                                         date=detail[sem_key],
+                                         time='',
+                                         activity=detail['title'],
+                                         note=section,
+                                         semester=sem_key,
+                                         event_type='deadline',
+                                         section=section
+                                     ))
                          else:
+                             # Single semester event
                              events.append(CalendarEvent(
                                  date=detail.get('date', ''),
                                  time='',
                                  activity=detail.get('title', ''),
                                  note=section,
+                                 semester=ThaiTextPreprocessor.normalize_thai_text(semester),
                                  event_type='deadline',
                                  section=section
                              ))
                  else:
+                     # Regular calendar event
                      event_type = CalendarEvent.classify_event_type(event.get('activity', ''))
+
+                     # Clean semester string
+                     cleaned_semester = semester
+                     if '(' in semester:
+                         match = re.search(r'\((.*?)\)', semester)
+                         if match:
+                             cleaned_semester = match.group(1)
+                     cleaned_semester = ThaiTextPreprocessor.normalize_thai_text(cleaned_semester)
+
                      events.append(CalendarEvent(
                          date=event.get('date', ''),
                          time=event.get('time', ''),
                          activity=event.get('activity', ''),
                          note=event.get('note', ''),
+                         semester=cleaned_semester,
                          event_type=event_type
                      ))

          return events

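parse_calendar_json reads only the keys 'academic_calendar', 'education', 'schedule', 'section', 'details', 'title', 'date', 'time', 'activity', and 'note'. The raw-data.json file itself is not part of this commit, so the input shape below is inferred from those accesses, with invented field values; it exercises both the regular-event and the dual-semester section branch:

# Hypothetical raw-data.json fragment (assumed shape, values invented).
sample = {
    "academic_calendar": [
        {
            "education": "ปฏิทินการศึกษา (ภาคต้น)",
            "schedule": [
                {"date": "1 ก.ค. 2567", "time": "08:30",
                 "activity": "ลงทะเบียนเรียน", "note": ""},
                {"section": "วิทยานิพนธ์", "details": [
                    {"title": "ส่งเล่มวิทยานิพนธ์",
                     "ภาคต้น": "1 ต.ค. 2567", "ภาคปลาย": "1 มี.ค. 2568"}
                ]}
            ]
        }
    ]
}
events = CalendarDataProcessor.parse_calendar_json(sample)  # -> List[CalendarEvent]
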
+     @staticmethod
+     def extract_program_details(json_data: Dict) -> ProgramDetailInfo:
+         """Extract and structure program details into ProgramDetailInfo object"""
+         raw_details = json_data.get('program_details', {})
+
+         # Process application info
+         app_info_data = raw_details.get('application_info', {})
+         app_info = ApplicationInfo(
+             application_portal=app_info_data.get('application_portal', ''),
+             program_email=app_info_data.get('program_email', '')
+         )
+
+         # Process required documents
+         req_docs = {}
+         raw_docs = raw_details.get('required_documents', {})
+
+         # Process mandatory documents
+         mandatory_docs = {}
+         for doc_key, doc_value in raw_docs.get('mandatory', {}).items():
+             mandatory_docs[doc_key] = RequiredDocument(
+                 name=doc_key,
+                 description=doc_value
+             )
+         req_docs['mandatory'] = mandatory_docs
+
+         # Process optional documents
+         optional_docs = {}
+         for doc_key, doc_data in raw_docs.get('optional', {}).items():
+             if doc_key == 'english_proficiency':
+                 ep_data = doc_data
+                 optional_docs[doc_key] = RequiredDocument(
+                     name=ep_data.get('name', ''),
+                     description=str(ep_data.get('accepted_tests', {})),
+                     conditions=f"Validity: {ep_data.get('validity', '')}, Benefits: {ep_data.get('benefits', '')}, Exemptions: {ep_data.get('exemptions', '')}"
+                 )
+             else:
+                 optional_docs[doc_key] = RequiredDocument(
+                     name=doc_data.get('name', ''),
+                     description='',
+                     conditions=doc_data.get('condition', '')
+                 )
+         req_docs['optional'] = optional_docs
+
+         # Process selection steps
+         selection_steps = []
+         for step_data in raw_details.get('selection_process', {}).get('steps', []):
+             for step_num, description in step_data.items():
+                 selection_steps.append(SelectionStep(
+                     step_number=step_num,
+                     description=description
+                 ))
+
+         return [ProgramDetailInfo(
+             application_info=app_info,
+             required_documents=req_docs,
+             submission_process=raw_details.get('submission_process', ''),
+             selection_process=selection_steps
+         )]
+
+     @staticmethod
+     def extract_contact_details(json_data: Dict) -> List[ContactDetail]:
+         """Extract and structure contact details into ContactDetail objects"""
+         raw_contacts = json_data.get('contact_details', [])
+         contact_details = []
+
+         # Handle the case where raw_contacts might be a single object instead of a list
+         if not isinstance(raw_contacts, list):
+             raw_contacts = [raw_contacts]
+
+         for contact_data in raw_contacts:
+             # Skip if contact_data is not a dictionary
+             if not isinstance(contact_data, dict):
+                 continue
+
+             try:
+                 # Process transportation data
+                 transportation_data = contact_data.get('transportation', {})
+                 transportation = Transportation(
+                     boat=transportation_data.get('boat', ''),
+                     bts=transportation_data.get('bts', ''),
+                     mrt=transportation_data.get('mrt', ''),
+                     airport_link=transportation_data.get('airport_link', ''),
+                     bus=transportation_data.get('bus', {})
+                 )
+
+                 # Process contact information
+                 contact_info = Contact(
+                     email=contact_data.get('email', ''),
+                     facebook=contact_data.get('facebook', {})
+                 )
+
+                 # Create ContactDetail object
+                 contact_details.append(ContactDetail(
+                     event_type=contact_data.get('event_type', ''),
+                     department=contact_data.get('department', ''),
+                     faculty=contact_data.get('faculty', ''),
+                     university=contact_data.get('university', ''),
+                     location=contact_data.get('location', ''),
+                     contact=contact_info,
+                     transportation=transportation
+                 ))
+             except Exception as e:
+                 print(f"Error processing contact data: {e}")
+                 continue
+
+         return contact_details
+
+
+     @staticmethod
+     def extract_course_structure(json_data: Dict) -> List[CourseStructure]:
+         """Extract and structure course information into CourseStructure objects"""
+         course_structures = []
+
+         # Get course structure data
+         course_data = json_data.get('course_structure', {})
+         program_metadata = course_data.get('program_metadata', {})
+         curriculum = course_data.get('curriculum_structure', {})
+
+         # Process foundation courses
+         foundation_data = curriculum.get('foundation_courses', {})
+         foundation_courses = []
+         for course in foundation_data.get('courses', []):
+             foundation_courses.append(Course(
+                 code=course.get('code', ''),
+                 title_th=course.get('title', {}).get('th', ''),
+                 title_en=course.get('title', {}).get('en', ''),
+                 credits=course.get('credits', 0)
+             ))
+
+         # Process core courses
+         core_data = curriculum.get('core_courses', {})
+         core_courses = []
+         for course in core_data.get('modules', []):
+             core_courses.append(Course(
+                 code=course.get('code', ''),
+                 title_th=course.get('title', {}).get('th', ''),
+                 title_en=course.get('title', {}).get('en', ''),
+                 credits=course.get('credits', 0)
+             ))
+
+         # Process elective courses
+         elective_data = curriculum.get('electives', {})
+         elective_courses = []
+         for course in elective_data.get('course_groups', []):
+             elective_courses.append(Course(
+                 code=course.get('code', ''),
+                 title_th=course.get('title', {}).get('th', ''),
+                 title_en=course.get('title', {}).get('en', ''),
+                 credits=course.get('credits', 0)
+             ))
+
+         # Process research courses
+         research_data = curriculum.get('research', {})
+         research_courses = []
+         for course in research_data.get('course', []):
+             research_courses.append(Course(
+                 code=course.get('code', ''),
+                 title_th=course.get('title', {}).get('th', ''),
+                 title_en=course.get('title', {}).get('en', ''),
+                 credits=course.get('credits', 0)
+             ))
+
+         # Create course categories
+         structure = {
+             'หมวดวิชาปรับพื้นฐาน': CourseCategory(  # Previously foundation_courses
+                 description=foundation_data.get('metadata', {}).get('description'),
+                 credits=foundation_data.get('metadata', {}).get('credits', 'non-credit'),
+                 minimum_credits=None,
+                 courses=foundation_courses
+             ),
+             'หมวดวิชาบังคับ': CourseCategory(  # Previously core_courses
+                 description=None,
+                 credits=0,
+                 minimum_credits=core_data.get('minimum_requirement_credits'),
+                 courses=core_courses
+             ),
+             'หมวดวิชาเลือก': CourseCategory(  # Previously elective_courses
+                 description=None,
+                 credits=0,
+                 minimum_credits=elective_data.get('minimum_requirement_credits'),
+                 courses=elective_courses
+             ),
+             'หมวดวิชาการค้นคว้าอิสระ': CourseCategory(  # Previously research_courses
+                 description=None,
+                 credits=0,
+                 minimum_credits=research_data.get('minimum_requirement_credits'),
+                 courses=research_courses
+             )
+         }
+
+         # Create course structure
+         course_structure = CourseStructure(
+             event_type='curriculum_structure',
+             program_name=program_metadata.get('name', ''),
+             department=program_metadata.get('department', ''),
+             total_credits=program_metadata.get('total_credits', 0),
+             degree_level=program_metadata.get('degree_level', ''),
+             structure=structure
+         )
+
+         return [course_structure]
+
+     @staticmethod
+     def extract_program_study_plan(json_data: Dict) -> List[StudyPlan]:
+         """Extract and structure study plan information into StudyPlan objects"""
+         study_plan_data = json_data.get('program_study_plan', {})
+
+         # Initialize the years dictionary to store all year/semester data
+         years_dict = {}
+
+         for year_key, year_data in study_plan_data.items():
+             years_dict[year_key] = {}
+
+             for semester_key, semester_data in year_data.items():
+                 # Get metadata
+                 metadata = semester_data.get('metadata', {})
+
+                 # Initialize semester structure
+                 semester_struct = {
+                     'metadata': metadata,
+                     'courses': []
+                 }
+
+                 # Handle both 'modules' and 'courses' keys
+                 course_data = semester_data.get('modules', []) or semester_data.get('courses', [])
+
+                 # Add courses to semester
+                 for course in course_data:
+                     course_info = {
+                         'code': course.get('code', ''),
+                         'title': course.get('title', {'th': '', 'en': ''}),
+                         'credits': course.get('credits', 0)
+                     }
+                     semester_struct['courses'].append(course_info)
+
+                 # Add semester data to year
+                 years_dict[year_key][semester_key] = semester_struct
+
+         # Create StudyPlan object
+         study_plan = StudyPlan(
+             event_type='study_plan',
+             years=years_dict
+         )
+
+         return [study_plan]
+
+     @staticmethod
+     def extract_fees(json_data: Dict) -> List[TuitionFee]:
+         """Extract and structure fee information into TuitionFee objects"""
+         fees_data = json_data.get('fees', {})
+
+         # Parse regular tuition fee
+         regular_fee_str = fees_data.get('tuition', '')
+         regular_amount = float(regular_fee_str.split()[0]) if regular_fee_str else 0
+
+         regular_fee = RegularFee(
+             amount=regular_amount,
+             currency='THB',
+             period='per semester'
+         )
+
+         # Parse late payment fee
+         late_fee_str = fees_data.get('late_payment', '')
+         late_amount = float(late_fee_str.split()[0]) if late_fee_str else 0
+
+         late_payment_fee = LatePaymentFee(
+             amount=late_amount,
+             currency='THB'
+         )
+
+         # Create TuitionFee object
+         tuition_fee = TuitionFee(
+             event_type='tuition_fee',
+             regular_fee=regular_fee,
+             late_payment_fee=late_payment_fee
+         )
+
+         return [tuition_fee]
+
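extract_fees assumes each fee string begins with a numeric amount and takes float(value.split()[0]). A quick check of that assumption, with invented fee strings:

# Hypothetical 'fees' block; only the leading number of each string is used.
fees_json = {"fees": {"tuition": "25000 THB", "late_payment": "500 THB"}}
tuition = CalendarDataProcessor.extract_fees(fees_json)[0]
# tuition.regular_fee.amount == 25000.0, tuition.late_payment_fee.amount == 500.0
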
+ class HybridDocumentStore:
+     """Enhanced document store with hybrid retrieval capabilities"""

      def __init__(self, config: PipelineConfig):
          self.store = InMemoryDocumentStore()
          self.embedder = SentenceTransformersDocumentEmbedder(
              model=config.model.embedder_model
          )
+         # Initialize BM25 retriever
+         self.bm25_retriever = InMemoryBM25Retriever(
+             document_store=self.store,
+             top_k=config.retriever.top_k
+         )
+         # Initialize embedding retriever
+         self.embedding_retriever = InMemoryEmbeddingRetriever(
+             document_store=self.store,
+             top_k=config.retriever.top_k
+         )
          self.cache_manager = CacheManager(
              cache_dir=config.cache.cache_dir,
+             ttl=config.cache.ttl
          )

          self.embedder.warm_up()

+         # Initialize containers
          self.events = []
          self.event_type_index = {}
          self.semester_index = {}
+         self._document_counter = 0
+
+         # Additional data containers
+         self.course_data = []
+         self.contact_data = []
+         self.study_plan_data = []
+
+     def _generate_unique_id(self) -> str:
+         """Generate a unique document ID"""
+         self._document_counter += 1
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         return f"doc_{timestamp}_{self._document_counter}"

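Both retrievers above are standard Haystack 2.x in-memory components; the wildcard import at the top of the new file (from haystack.components.retrievers.in_memory import *) is what brings InMemoryBM25Retriever and InMemoryEmbeddingRetriever into scope. A minimal standalone sketch of the BM25 half, outside the pipeline:

from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever

store = InMemoryDocumentStore()
store.write_documents([Document(content="ลงทะเบียนเรียน ภาคต้น 2567")])
retriever = InMemoryBM25Retriever(document_store=store, top_k=5)
docs = retriever.run(query="ลงทะเบียน")["documents"]  # ranked by BM25 score
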
 
841
  def _compute_embedding(self, text: str) -> Any:
842
  """Compute embedding with caching"""
 
848
  embedding = self.embedder.run(documents=[doc])["documents"][0].embedding
849
  self.cache_manager.set_embedding_cache(text, embedding)
850
  return embedding
851
+
852
+ def add_document(self, text: str, event_type: str):
853
+ """Add a single document to the store"""
854
+ try:
855
+ # Compute embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
856
  embedding = self._compute_embedding(text)
857
 
858
+ # Create document with unique ID
859
  doc = Document(
860
+ id=self._generate_unique_id(),
861
  content=text,
862
  embedding=embedding,
863
+ meta={'event_type': event_type}
 
 
 
 
864
  )
865
+
866
+ # Write document
867
+ self.store.write_documents([doc])
868
 
869
  # Cache document
870
+ self.cache_manager.set_document_cache(doc.id, doc)
871
+
872
+ except Exception as e:
873
+ logger.error(f"Error adding document: {str(e)}")
874
+ raise
875
+
876
+    def add_events(self, events: List[CalendarEvent], contact_details: Optional[List[ContactDetail]] = None,
+                   course_structure: Optional[List[CourseStructure]] = None,
+                   study_plans: Optional[List[StudyPlan]] = None):
+        """Add events and additional data with caching"""
+        documents = []
+        added_events = set()  # Track added events to prevent duplicates
+
+        # Process calendar events
+        for event in events:
+            event_key = f"{event.date}_{event.activity}_{event.semester}"
+            if event_key not in added_events:
+                added_events.add(event_key)
+                self.events.append(event)
+                event_idx = len(self.events) - 1
+
+                # Update indices
+                if event.event_type not in self.event_type_index:
+                    self.event_type_index[event.event_type] = []
+                self.event_type_index[event.event_type].append(event_idx)
+
+                if event.semester not in self.semester_index:
+                    self.semester_index[event.semester] = []
+                self.semester_index[event.semester].append(event_idx)
+
+                # Create document
+                text = event.to_searchable_text()
+                embedding = self._compute_embedding(text)
+                doc = Document(
+                    id=self._generate_unique_id(),
+                    content=text,
+                    embedding=embedding,
+                    meta={
+                        'event_type': event.event_type,
+                        'semester': event.semester,
+                        'date': event.date,
+                        'event_idx': event_idx
+                    }
+                )
+                documents.append(doc)
+                self.cache_manager.set_document_cache(str(event_idx), doc)
+
+        # Process contact details
+        if contact_details:
+            for contact in contact_details:
+                self.contact_data.append(contact)
+                text = f"""
+                ข้อมูลการติดต่อ:
+                คณะ: {contact.faculty}
+                ภาควิชา: {contact.department}
+                มหาวิทยาลัย: {contact.university}
+                สถานที่: {contact.location}
+
+                การติดต่อ:
+                อีเมล: {contact.contact.email}
+                Facebook: {json.dumps(contact.contact.facebook, ensure_ascii=False)}
+
+                การเดินทาง:
+                เรือ: {contact.transportation.boat}
+                BTS: {contact.transportation.bts}
+                MRT: {contact.transportation.mrt}
+                Airport Link: {contact.transportation.airport_link}
+                รถประจำทาง: {json.dumps(contact.transportation.bus, ensure_ascii=False)}
+                """
+                embedding = self._compute_embedding(text)
+                doc = Document(
+                    id=self._generate_unique_id(),
+                    content=text,
+                    embedding=embedding,
+                    meta={'event_type': 'contact'}
+                )
+                documents.append(doc)
+
+        # Process course structure
+        if course_structure:
+            for course in course_structure:
+                self.course_data.append(course)
+                text = f"""
+                โครงสร้างหลักสูตร:
+                ชื่อหลักสูตร: {course.program_name}
+                ภาควิชา: {course.department}
+                หน่วยกิตรวม: {course.total_credits}
+                ระดับการศึกษา: {course.degree_level}
+
+                รายละเอียดโครงสร้าง:
+                """
+                for category_name, category in course.structure.items():
+                    text += f"\n{category_name}:\n"
+                    if category.description:
+                        text += f"คำอธิบาย: {category.description}\n"
+                    text += f"หน่วยกิต: {category.credits}\n"
+                    if category.minimum_credits:
+                        text += f"หน่วยกิตขั้นต่ำ: {category.minimum_credits}\n"
+                    text += "รายวิชา:\n"
+                    for course_item in category.courses:
+                        text += f"- {course_item.code}: {course_item.title_th} ({course_item.title_en}) - {course_item.credits} หน่วยกิต\n"
+
+                embedding = self._compute_embedding(text)
+                doc = Document(
+                    id=self._generate_unique_id(),
+                    content=text,
+                    embedding=embedding,
+                    meta={'event_type': 'curriculum'}
+                )
+                documents.append(doc)
+
+        # Process study plans
+        if study_plans:
+            for plan in study_plans:
+                self.study_plan_data.append(plan)
+                text = "แผนการศึกษา:\n"
+                for year, semesters in plan.years.items():
+                    text += f"\nปีที่ {year}:\n"
+                    for semester, data in semesters.items():
+                        text += f"\n{semester}:\n"
+                        if 'metadata' in data and data['metadata']:
+                            text += f"ข้อมูลเพิ่มเติม: {json.dumps(data['metadata'], ensure_ascii=False)}\n"
+                        if 'courses' in data:
+                            for course in data['courses']:
+                                text += f"- {course['code']}: {course['title'].get('th', '')} ({course['title'].get('en', '')}) - {course['credits']} หน่วยกิต\n"
+
+                embedding = self._compute_embedding(text)
+                doc = Document(
+                    id=self._generate_unique_id(),
+                    content=text,
+                    embedding=embedding,
+                    meta={'event_type': 'study_plan'}
+                )
+                documents.append(doc)
+
+        # Write in small batches; on a batch failure, retry documents one by one
+        # so a single bad document cannot sink the whole batch
+        batch_size = 10
+        for i in range(0, len(documents), batch_size):
+            batch = documents[i:i + batch_size]
+            try:
+                self.store.write_documents(batch)
+            except Exception as e:
+                logger.error(f"Error writing document batch {i//batch_size + 1}: {str(e)}")
+                for doc in batch:
+                    try:
+                        self.store.write_documents([doc])
+                    except Exception as e2:
+                        logger.error(f"Failed to write document {doc.id}: {str(e2)}")
+
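+    # Dedup sketch: event_key is a plain string, so re-adding the same parsed
+    # row is a no-op. With hypothetical field values:
+    #
+    #     date="2024-08-01", activity="เปิดภาคเรียน", semester="1/2567"
+    #     # -> event_key "2024-08-01_เปิดภาคเรียน_1/2567"
+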
+    def hybrid_search(self,
+                      query: str,
+                      event_type: Optional[str] = None,
+                      semester: Optional[str] = None,
+                      top_k: int = 10,
+                      weight_semantic: float = 0.5) -> List[Document]:
+        """Hybrid search combining semantic and lexical search results"""
+
         cache_key = json.dumps({
             'query': query,
             'event_type': event_type,
             'semester': semester,
+            'top_k': top_k,
+            'weight_semantic': weight_semantic
         })
+
         cached_results = self.cache_manager.get_query_cache(cache_key)
         if cached_results is not None:
             return cached_results
+
+        # Get semantic search results
         query_embedding = self._compute_embedding(query)
+        semantic_results = self.embedding_retriever.run(
+            query_embedding=query_embedding
+        )["documents"]
 
+        # Get BM25 results
+        bm25_results = self.bm25_retriever.run(
+            query=query
+        )["documents"]
 
+        # Combine results using score fusion
+        combined_results = self._merge_results(
+            semantic_results=semantic_results,
+            bm25_results=bm25_results,
+            weight_semantic=weight_semantic,
+            top_k=top_k
+        )
 
+        # Filter results based on metadata
         filtered_results = []
+        for doc in combined_results:
+            if event_type and doc.meta.get('event_type') != event_type:
                 continue
+            if semester and doc.meta.get('semester') != semester:
                 continue
             filtered_results.append(doc)
 
         final_results = filtered_results[:top_k]
 
         self.cache_manager.set_query_cache(cache_key, final_results)
 
         return final_results
+
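+    # Usage sketch: weight_semantic=1.0 is pure embedding search, 0.0 is pure
+    # BM25; the metadata filters run after fusion, so fewer than top_k documents
+    # may come back. Hypothetical call:
+    #
+    #     docs = store.hybrid_search(
+    #         query="วันสอบปลายภาค",  # "final exam dates"
+    #         event_type="การสอบ",
+    #         semester="1/2567",
+    #         top_k=5,
+    #         weight_semantic=0.7,
+    #     )
+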
+    def _merge_results(self,
+                       semantic_results: List[Document],
+                       bm25_results: List[Document],
+                       weight_semantic: float,
+                       top_k: int) -> List[Document]:
+        """Merge semantic and BM25 results using weighted score fusion"""
+
+        # Dictionaries of max-normalized scores per document id
+        semantic_scores = {}
+        bm25_scores = {}
+
+        # Normalize semantic scores to [0, 1]
+        max_semantic_score = max(doc.score for doc in semantic_results) if semantic_results else 1.0
+        for doc in semantic_results:
+            semantic_scores[doc.id] = doc.score / max_semantic_score if max_semantic_score > 0 else 0
+
+        # Normalize BM25 scores to [0, 1]
+        max_bm25_score = max(doc.score for doc in bm25_results) if bm25_results else 1.0
+        for doc in bm25_results:
+            bm25_scores[doc.id] = doc.score / max_bm25_score if max_bm25_score > 0 else 0
+
+        # Combine scores; a document missing from one result list scores 0 there
+        combined_scores = {}
+        all_docs = {doc.id: doc for doc in semantic_results + bm25_results}
+
+        for doc_id in all_docs:
+            semantic_score = semantic_scores.get(doc_id, 0)
+            bm25_score = bm25_scores.get(doc_id, 0)
+
+            # Weighted combination
+            combined_scores[doc_id] = (
+                weight_semantic * semantic_score +
+                (1 - weight_semantic) * bm25_score
+            )
+
+        # Sort by combined score and return top_k results
+        sorted_docs = sorted(
+            all_docs.values(),
+            key=lambda x: combined_scores[x.id],
+            reverse=True
+        )
+
+        return sorted_docs[:top_k]
 
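+    # Worked example of the fusion above, with hypothetical normalized scores:
+    # at weight_semantic = 0.7, a document scoring 0.9 semantic / 0.4 BM25 gets
+    # 0.7*0.9 + 0.3*0.4 = 0.75 and outranks one scoring 0.6 on both
+    # (0.7*0.6 + 0.3*0.6 = 0.60).
+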
 class AdvancedQueryProcessor:
     """Process queries with better understanding"""
 
         )
+        # Prompt (Thai): analyse an academic-calendar question and return JSON
+        # with event_type, semester, key_terms, and response_format
         self.prompt_builder = PromptBuilder(
             template="""
+            วิเคราะห์คำถามที่เกี่ยวข้องกับปฏิทินการศึกษา (ภาษาไทย):
+            คำถาม: {{query}}
 
+            ระบุ:
+            1. ประเภทของข้อมูลที่ต้องการค้นหา
+            2. ภาคการศึกษาที่ระบุไว้ (ถ้ามี)
+            3. คำสำคัญที่เกี่ยวข้อง
 
+            ให้ผลลัพธ์ในรูปแบบ JSON:
             {
+                "event_type": "ลงทะเบียน|กำหนดเวลา|การสอบ|วิชาการ|วันหยุด",
+                "semester": "ภาคการศึกษาที่ระบุ หรือ null",
+                "key_terms": ["คำสำคัญ 3 คำที่สำคัญที่สุด"],
+                "response_format": "รายการ|คำตอบเดียว|คำตอบละเอียด"
             }
+            """
+        )
 
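+    # Example of the JSON analysis this prompt should yield for
+    # "สอบปลายภาคเทอม 1 วันไหน" ("When are the semester-1 final exams?")
+    # (illustrative only; real model output can vary):
+    #
+    #     {"event_type": "การสอบ", "semester": "ภาคเรียนที่ 1",
+    #      "key_terms": ["สอบปลายภาค", "เทอม 1", "วันสอบ"],
+    #      "response_format": "คำตอบเดียว"}
+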
     def _get_default_analysis(self, query: str) -> Dict[str, Any]:
         """Return default analysis when processing fails"""
 
             "key_terms": [],
             "response_format": "detailed"
         }
 
+    def process_query(self, query: str) -> Dict[str, Any]:
+        """Enhanced query processing with better error handling."""
         try:
+            result = self.prompt_builder.run(query=query)
+            response = self.generator.run(prompt=result["prompt"])
+
+            if not response or not response.get("replies") or not response["replies"][0]:
+                logger.warning("Received empty response from OpenAI")
+                return self._get_default_analysis(query)
+
+            try:
+                analysis = json.loads(response["replies"][0])
+            except json.JSONDecodeError as je:
+                logger.warning(f"Could not parse analysis as JSON: {je}")
+                return self._get_default_analysis(query)
+
+            # Ensure course-related queries retrieve study plans & curricula
+            course_keywords = ['หน่วยกิต', 'วิชา', 'หลักสูตร', 'แผนการเรียน', 'วิชาเลือก', 'วิชาบังคับ', 'วิชาการค้นคว้า', 'วิชาหลัก']
+            if any(keyword in query for keyword in course_keywords):
+                analysis['event_type'] = 'curriculum'
+
+            # Ensure fee-related queries retrieve tuition fee documents
+            fee_keywords = ['ค่าเทอม', 'ค่าธรรมเนียม', 'ค่าเรียน', 'ค่าปรับ']
+            if any(keyword in query for keyword in fee_keywords):
+                analysis['event_type'] = 'fees'
+
+            return {
+                "original_query": query,
+                **analysis
             }
+
+        except Exception as e:
+            logger.error(f"Query processing failed: {str(e)}")
+            return self._get_default_analysis(query)
 
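+    # Note on the overrides above: keyword matches win over the model's own
+    # classification. Hypothetical call:
+    #
+    #     processor = AdvancedQueryProcessor(config)
+    #     info = processor.process_query("ค่าเทอมเท่าไหร่")  # "How much is tuition?"
+    #     # info["event_type"] == "fees" whenever the model call succeeds,
+    #     # because "ค่าเทอม" is in fee_keywords.
+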
 class ResponseGenerator:
     """Generate responses with better context utilization"""
 
         )
+        # Prompt (Thai): act as an academic advisor, answer from the supplied
+        # calendar context only, reply "ไม่มีข้อมูลที่ตรงกับคำถาม" ("no matching
+        # information") instead of guessing, answer in Thai, and always invite
+        # follow-up questions
         self.prompt_builder = PromptBuilder(
             template="""
+            คุณเป็นที่ปรึกษาทางวิชาการ กรุณาตอบคำถามต่อไปนี้โดยใช้ข้อมูลจากปฏิทินการศึกษาที่ให้มา
+
+            คำถาม: {{query}}
+
+            ข้อมูลที่เกี่ยวข้องจากปฏิทินการศึกษา:
             {% for doc in context %}
+            ---
             {{doc.content}}
             {% endfor %}
 
+            **ห้ามเดาข้อมูลเอง ถ้าไม่มีข้อมูลให้ตอบว่า "ไม่มีข้อมูลที่ตรงกับคำถาม"**
+
+            กรุณาตอบเป็นภาษาไทย:
+
+            ต้องบอกเสมอว่า **หากมีข้อสงสัยเพิ่มเติมสามารถสอบถามได้**
+            """
+        )
+
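+    # Template-filling sketch (hypothetical document; PromptBuilder renders the
+    # Jinja variables {{query}} and context, as the module already does above):
+    #
+    #     prompt = self.prompt_builder.run(
+    #         query="เปิดเทอมวันไหน",  # "When does the semester start?"
+    #         context=[Document(content="เปิดภาคเรียนที่ 1: 1 สิงหาคม")],
+    #     )["prompt"]
+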
     def generate_response(self,
                           query: str,
                           documents: List[Document],
 
             return "ขออภัย ไม่สามารถประมวลผลคำตอบได้ในขณะนี้"  # "Sorry, the answer cannot be processed right now."
 
 class AcademicCalendarRAG:
+    """Enhanced RAG system for academic calendar and program information"""
 
     def __init__(self, config: PipelineConfig):
         self.config = config
+        self.document_store = HybridDocumentStore(config)  # Use the hybrid (semantic + BM25) store
         self.query_processor = AdvancedQueryProcessor(config)
         self.response_generator = ResponseGenerator(config)
+        self.data_processor = CalendarDataProcessor()
+
+        # Initialize data containers
+        self.calendar_events = []
+        self.program_details = []
+        self.contact_details = []
+        self.course_structure = []
+        self.study_plans = []
+        self.tuition_fees = []
+
+    def load_data(self, json_data: Dict):
+        """Load and process all data sources"""
+        try:
+            raw_events = self.data_processor.parse_calendar_json(json_data)
+            for event in raw_events:
+                if not event.event_type:
+                    event.event_type = CalendarEvent.classify_event_type(event.activity)
+                self.calendar_events.append(event)
+
+            # Process other data types
+            self.program_details = self.data_processor.extract_program_details(json_data)
+            self.contact_details = self.data_processor.extract_contact_details(json_data)
+            self.course_structure = self.data_processor.extract_course_structure(json_data)
+            self.study_plans = self.data_processor.extract_program_study_plan(json_data)
+            self.tuition_fees = self.data_processor.extract_fees(json_data)
+
+            self._add_calendar_events()
+            self._add_program_info()
+
+        except Exception as e:
+            logger.error(f"Error loading data: {str(e)}")
+            raise
+
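+    # Loading sketch, mirroring the commented-out main() at the bottom of this
+    # module (file name taken from that example):
+    #
+    #     pipeline = AcademicCalendarRAG(config)
+    #     with open("raw-data.json", "r", encoding="utf-8") as f:
+    #         pipeline.load_data(json.load(f))
+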
+    def _add_calendar_events(self):
+        """Add calendar events and other data to the document store"""
+        if self.calendar_events:
+            self.document_store.add_events(
+                events=self.calendar_events,
+                contact_details=self.contact_details,
+                course_structure=self.course_structure,
+                study_plans=self.study_plans
+            )
+
+    def _add_program_info(self):
+        """Add program-related information to the document store"""
+        if self.program_details:
+            for detail in self.program_details:
+                text = f"""
+                ข้อมูลการสมัคร:
+                เว็บไซต์รับสมัคร: {detail.application_info.application_portal}
+                อีเมล: {detail.application_info.program_email}
+
+                เอกสารที่ต้องใช้:
+                {self._format_required_docs(detail.required_documents)}
+
+                ขั้นตอนการส่งเอกสาร:
+                {detail.submission_process}
+
+                ขั้นตอนการคัดเลือก:
+                {self._format_selection_steps(detail.selection_process)}
+                """
+                self.document_store.add_document(text, "program_details")
 
+        if self.tuition_fees:
+            for fee in self.tuition_fees:
+                text = f"""
+                ค่าธรรมเนียมการศึกษา:
+                ค่าเล่าเรียนปกติ: {fee.regular_fee.amount:,.2f} {fee.regular_fee.currency} {fee.regular_fee.period}
+                ค่าปรับชำระล่าช้า: {fee.late_payment_fee.amount:,.2f} {fee.late_payment_fee.currency}
+                """
+                self.document_store.add_document(text, "fees")
+
+    def _format_required_docs(self, docs: Dict) -> str:
+        """Format required-documents information, including detailed English proficiency requirements"""
+        import ast  # used to parse the accepted-tests mapping safely
+        result = []
 
+        if 'mandatory' in docs:
+            result.append("เอกสารที่ต้องใช้:")
+            for doc in docs['mandatory'].values():
+                result.append(f"- {doc.name}: {doc.description}")
+
+        if 'optional' in docs:
+            result.append("\nเอกสารเพิ่มเติม:")
+            for doc_key, doc in docs['optional'].items():
+                if doc_key == 'english_proficiency':
+                    result.append(f"- {doc.name}")
+                    # Parse and format the accepted tests; ast.literal_eval is a
+                    # safe replacement for eval() on data strings
+                    try:
+                        accepted_tests = ast.literal_eval(doc.description)
+                        result.append("  เกณฑ์คะแนนที่ยอมรับ:")
+                        for test, requirement in accepted_tests.items():
+                            result.append(f"  * {test}: {requirement}")
+                    except (ValueError, SyntaxError, AttributeError):
+                        result.append(f"  {doc.description}")
+
+                    if doc.conditions:
+                        conditions = doc.conditions.split(', ')
+                        for condition in conditions:
+                            result.append(f"  {condition}")
+                else:
+                    desc = f"- {doc.name}"
+                    if doc.conditions:
+                        desc += f" ({doc.conditions})"
+                    result.append(desc)
+
+        return "\n".join(result)
+
+    def _format_selection_steps(self, steps: List[SelectionStep]) -> str:
+        """Format selection process steps"""
+        return "\n".join(f"{step.step_number}. {step.description}" for step in steps)
+
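+    # ast.literal_eval sketch (hypothetical description string): it accepts
+    # only Python literals, so unlike eval() it cannot execute arbitrary code.
+    #
+    #     import ast
+    #     ast.literal_eval("{'IELTS': '>= 5.5', 'TOEFL iBT': '>= 61'}")
+    #     # -> {'IELTS': '>= 5.5', 'TOEFL iBT': '>= 61'}
+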
+    def _get_fee_documents(self) -> List[Document]:
+        """Get fee-related documents"""
+        if not self.tuition_fees:
+            return []
+
+        documents = []
+        for fee in self.tuition_fees:
+            text = f"""
+            ค่าธรรมเนียมการศึกษา:
+            - ค่าเล่าเรียน: {fee.regular_fee.amount:,.2f} {fee.regular_fee.currency} {fee.regular_fee.period}
+            - ค่าปรับชำระล่าช้า: {fee.late_payment_fee.amount:,.2f} {fee.late_payment_fee.currency}
+            """
+            doc = Document(
+                content=text,
+                meta={"event_type": "fees"}
+            )
+            documents.append(doc)
+
+        return documents
+
+    def process_query(self, query: str, weight_semantic: float = 0.5) -> Dict[str, Any]:
+        """Process a user query using hybrid retrieval"""
         try:
             # Analyze query
             query_info = self.query_processor.process_query(query)
 
+            # Get relevant documents using hybrid search
+            documents = self.document_store.hybrid_search(
                 query=query,
+                event_type=query_info.get("event_type"),
+                semester=query_info.get("semester"),
+                top_k=self.config.retriever.top_k,
+                weight_semantic=weight_semantic
             )
 
+            # Add fee information for fee-related queries
+            if query_info.get("event_type") == "fees" and self.tuition_fees:
+                fee_docs = self._get_fee_documents()
+                documents.extend(fee_docs)
+
             # Generate response
             response = self.response_generator.generate_response(
                 query=query,
 
             )
 
             return {
+                "query": query,
                 "answer": response,
+                "relevant_docs": documents,
                 "query_info": query_info
             }
 
         except Exception as e:
+            logger.error(f"Error processing query: {str(e)}")
             return {
+                "query": query,
+                "answer": "ขออภัย ไม่สามารถประมวลผลคำตอบได้ในขณะนี้",  # "Sorry, the answer cannot be processed right now."
+                "error": str(e)
             }
 
 # def main():
+#     """Main function demonstrating hybrid retrieval"""
 #     try:
 #         # Load API key
 #         with open("key.txt", "r") as f:
 #             openai_api_key = f.read().strip()
 
+#         # Create config with hybrid retrieval settings
 #         config = create_default_config(openai_api_key)
 #         config.localization.enable_thai_normalization = True
+#         config.retriever.top_k = 5
+#         config.model.temperature = 0.3
 
 #         pipeline = AcademicCalendarRAG(config)
 
+#         # Load and process data
+#         with open("raw-data.json", "r", encoding="utf-8") as f:
+#             raw_data = json.load(f)
 
+#         pipeline.load_data(raw_data)
 
+#         # Test queries with different semantic weights
+#         queries = ["เปิดเทอมวันเเรกวันไหน"]  # "When is the first day of term?"
 
 #         print("=" * 80)
 
 #         for query in queries:
 #             print(f"\nQuery: {query}")
+#             result = pipeline.process_query(query, weight_semantic=0.3)
 #             print(f"Answer: {result['answer']}")
+#             print("-" * 40)
 
 #     except Exception as e:
 #         logger.error(f"Pipeline execution failed: {str(e)}")