from haystack import Document
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils import Secret
from pathlib import Path
import ast
import hashlib
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field
import json
import logging
import re
import pickle
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
"""Stores the result of a validation check"""
is_valid: bool
errors: List[str]
warnings: List[str]
normalized_data: Dict[str, str]
@dataclass
class ApplicationInfo:
application_portal: str
program_email: str
@dataclass
class RequiredDocument:
name: str
description: str
conditions: Optional[str] = None
@dataclass
class SelectionStep:
step_number: str
description: str
@dataclass
class ProgramDetailInfo:
application_info: ApplicationInfo
required_documents: Dict[str, Dict[str, RequiredDocument]]
submission_process: str
selection_process: List[SelectionStep]
@dataclass
class Transportation:
boat: str
bts: str
mrt: str
airport_link: str
bus: Dict[str, str]
@dataclass
class Contact:
email: str
facebook: Dict[str, str]
@dataclass
class ContactDetail:
event_type: str
department: str
faculty: str
university: str
location: str
contact: Contact
transportation: Transportation
@dataclass
class Course:
code: str
title_th: str
title_en: str
credits: int
@dataclass
class CourseCategory:
description: Optional[str]
credits: Union[str, int]
minimum_credits: Optional[int]
courses: List[Course]
@dataclass
class CourseStructure:
event_type: str
program_name: str
department: str
total_credits: int
degree_level: str
structure: Dict[str, CourseCategory]
@dataclass
class StudyPlan:
event_type: str
years: Dict[str, Dict[str, Any]]
@dataclass
class RegularFee:
amount: float
currency: str
period: str
@dataclass
class LatePaymentFee:
amount: float
currency: str
@dataclass
class TuitionFee:
event_type: str
regular_fee: RegularFee
late_payment_fee: LatePaymentFee
class OpenAIDateParser:
"""Uses OpenAI to parse complex Thai date formats"""
def __init__(self, api_key: str, model: str = "gpt-4o"):
self.generator = OpenAIGenerator(
api_key=Secret.from_token(api_key),
model=model
)
self.prompt_builder = PromptBuilder(
template="""
Parse the following Thai date range into a structured format:
Date: {{date}}
Return in JSON format:
{
"start_date": "YYYY-MM-DD",
"end_date": "YYYY-MM-DD" (if range),
"is_range": true/false
}
Notes:
- Convert Buddhist Era (BE) to CE
- Handle abbreviated Thai months
- Account for date ranges with dashes
- Return null for end_date if it's a single date
Example inputs and outputs:
Input: "จ 8 ก.ค. – จ 19 ส.ค. 67"
Output: {"start_date": "2024-07-08", "end_date": "2024-08-19", "is_range": true}
Input: "15 มกราคม 2567"
Output: {"start_date": "2024-01-15", "end_date": null, "is_range": false}
"""
)
async def parse_date(self, date_str: str) -> Dict[str, Union[str, bool]]:
"""Parse complex Thai date format using OpenAI"""
try:
result = self.prompt_builder.run(date=date_str)
            # OpenAIGenerator exposes a synchronous run(); the method itself
            # stays async so existing callers can keep awaiting it
            response = self.generator.run(prompt=result["prompt"])
if not response or not response.get("replies"):
raise ValueError("Empty response from OpenAI")
parsed = json.loads(response["replies"][0])
for date_field in ['start_date', 'end_date']:
if parsed.get(date_field):
datetime.strptime(parsed[date_field], '%Y-%m-%d')
return parsed
except Exception as e:
logger.error(f"OpenAI date parsing failed for '{date_str}': {str(e)}")
raise ValueError(f"Could not parse date: {date_str}")
class ThaiTextPreprocessor:
"""Handles Thai text preprocessing and normalization"""
    CHAR_MAP = {
        'ํา': 'ำ',  # decomposed sara am -> composed form
        '์': '',    # drop stray thanthakhat
        '–': '-',   # en dash -> hyphen
        '—': '-',   # em dash -> hyphen
        '٫': ',',   # Arabic decimal separator -> comma
    }
@classmethod
def normalize_thai_text(cls, text: str) -> str:
"""Normalize Thai text by applying character mappings and spacing rules"""
if not text:
return text
for old, new in cls.CHAR_MAP.items():
text = text.replace(old, new)
text = re.sub(r'\s+', ' ', text.strip())
thai_digits = '๐๑๒๓๔๕๖๗๘๙'
arabic_digits = '0123456789'
for thai, arabic in zip(thai_digits, arabic_digits):
text = text.replace(thai, arabic)
return text
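# Illustrative helper (not called anywhere): normalize_thai_text is fully
# deterministic, so the assertion below follows directly from CHAR_MAP and
# the digit-mapping loop above.
def _demo_normalize_thai_text() -> str:
    sample = "๑๕  มกราคม  ๒๕๖๗"  # Thai digits, extra whitespace
    normalized = ThaiTextPreprocessor.normalize_thai_text(sample)
    assert normalized == "15 มกราคม 2567"
    return normalized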
class CalendarEventValidator:
"""Validates and preprocesses calendar events"""
def __init__(self, openai_api_key: str):
self.preprocessor = ThaiTextPreprocessor()
self.date_parser = OpenAIDateParser(api_key=openai_api_key)
async def validate_event(self, event: 'CalendarEvent') -> ValidationResult:
"""Validate a calendar event and return validation results"""
errors = []
warnings = []
normalized_data = {}
if event.date:
try:
parsed_date = await self.date_parser.parse_date(event.date)
normalized_data['date'] = parsed_date['start_date']
                if parsed_date.get('is_range') and parsed_date.get('end_date'):
range_note = f"ถึงวันที่ {parsed_date['end_date']}"
if event.note:
normalized_data['note'] = f"{event.note}; {range_note}"
else:
normalized_data['note'] = range_note
except ValueError as e:
errors.append(f"Invalid date format: {event.date}")
else:
errors.append("Date is required")
if event.time:
time_pattern = r'^([01]?[0-9]|2[0-3]):([0-5][0-9])$'
if not re.match(time_pattern, event.time):
errors.append(f"Invalid time format: {event.time}")
normalized_data['time'] = event.time
if event.activity:
normalized_activity = self.preprocessor.normalize_thai_text(event.activity)
if len(normalized_activity) < 3:
warnings.append("Activity description is very short")
normalized_data['activity'] = normalized_activity
else:
errors.append("Activity is required")
valid_semesters = {'ภาคต้น', 'ภาคปลาย', 'ภาคฤดูร้อน'}
if event.semester:
normalized_semester = self.preprocessor.normalize_thai_text(event.semester)
if normalized_semester not in valid_semesters:
warnings.append(f"Unusual semester value: {event.semester}")
normalized_data['semester'] = normalized_semester
else:
errors.append("Semester is required")
valid_types = {'registration', 'deadline', 'examination', 'academic', 'holiday'}
if event.event_type not in valid_types:
errors.append(f"Invalid event type: {event.event_type}")
normalized_data['event_type'] = event.event_type
if event.note and 'note' not in normalized_data:
normalized_data['note'] = self.preprocessor.normalize_thai_text(event.note)
if event.section:
normalized_data['section'] = self.preprocessor.normalize_thai_text(event.section)
return ValidationResult(
is_valid=len(errors) == 0,
errors=errors,
warnings=warnings,
normalized_data=normalized_data
)
@dataclass
class CalendarEvent:
"""Structured representation of a calendar event with validation"""
date: str
time: str
activity: str
note: str
semester: str
event_type: str
section: Optional[str] = None
@staticmethod
def classify_event_type(activity: str) -> str:
"""Classify event type based on activity description"""
activity_lower = activity.lower()
keywords = {
'registration': ['ลงทะเบียน', 'ชําระเงิน', 'ค่าธรรมเนียม', 'เปิดเรียน'],
'deadline': ['วันสุดท้าย', 'กําหนด', 'ภายใน', 'ต้องส่ง'],
'examination': ['สอบ', 'ปริญญานิพนธ์', 'วิทยานิพนธ์', 'สอบปากเปล่า'],
'holiday': ['วันหยุด', 'ชดเชย', 'เทศกาล'],
}
for event_type, terms in keywords.items():
if any(term in activity_lower for term in terms):
return event_type
return 'academic'
async def initialize(self, openai_api_key: str):
"""Asynchronously validate and normalize the event"""
validator = CalendarEventValidator(openai_api_key)
result = await validator.validate_event(self)
if not result.is_valid:
raise ValueError(f"Invalid calendar event: {', '.join(result.errors)}")
for field, value in result.normalized_data.items():
setattr(self, field, value)
if result.warnings:
logger.warning(f"Calendar event warnings: {', '.join(result.warnings)}")
def to_searchable_text(self) -> str:
"""Convert event to searchable text format"""
return f"""
ภาคการศึกษา: {self.semester}
ประเภท: {self.event_type}
วันที่: {self.date}
เวลา: {self.time or '-'}
กิจกรรม: {self.activity}
หมวดหมู่: {self.section or '-'}
หมายเหตุ: {self.note or '-'}
""".strip()
class CacheManager:
"""Manages caching for different components of the RAG pipeline"""
    def __init__(self, cache_dir: Path, ttl: int = 3600):
        """
        Initialize CacheManager.

        Args:
            cache_dir: Directory for cache pickles (created if missing).
            ttl: Time-to-live for cache entries, in seconds.
        """
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.ttl = ttl
        self.embeddings_cache = self._load_cache("embeddings")
        self.query_cache = self._load_cache("queries")
        self.document_cache = self._load_cache("documents")
def _generate_key(self, data: Union[str, Dict, Any]) -> str:
"""Generate a unique cache key"""
if isinstance(data, str):
content = data.encode('utf-8')
else:
content = json.dumps(data, sort_keys=True).encode('utf-8')
return hashlib.md5(content).hexdigest()
def _load_cache(self, cache_type: str) -> Dict:
"""Load cache from disk"""
cache_path = self.cache_dir / f"{cache_type}_cache.pkl"
if cache_path.exists():
try:
with open(cache_path, 'rb') as f:
cache = pickle.load(f)
self._clean_expired_entries(cache)
return cache
except Exception as e:
logger.warning(f"Failed to load {cache_type} cache: {e}")
return {}
return {}
def _save_cache(self, cache_type: str, cache_data: Dict):
"""Save cache to disk"""
cache_path = self.cache_dir / f"{cache_type}_cache.pkl"
try:
with open(cache_path, 'wb') as f:
pickle.dump(cache_data, f)
except Exception as e:
logger.error(f"Failed to save {cache_type} cache: {e}")
def _clean_expired_entries(self, cache: Dict):
"""Remove expired cache entries"""
current_time = datetime.now()
expired_keys = [
key for key, (_, timestamp) in cache.items()
if current_time - timestamp > timedelta(seconds=self.ttl)
]
for key in expired_keys:
del cache[key]
def get_embedding_cache(self, text: str) -> Optional[Any]:
"""Get cached embedding for text"""
key = self._generate_key(text)
if key in self.embeddings_cache:
embedding, timestamp = self.embeddings_cache[key]
if datetime.now() - timestamp <= timedelta(seconds=self.ttl):
return embedding
return None
def set_embedding_cache(self, text: str, embedding: Any):
"""Cache embedding for text"""
key = self._generate_key(text)
self.embeddings_cache[key] = (embedding, datetime.now())
self._save_cache("embeddings", self.embeddings_cache)
def get_query_cache(self, query: str) -> Optional[Dict]:
"""Get cached query results"""
key = self._generate_key(query)
if key in self.query_cache:
result, timestamp = self.query_cache[key]
if datetime.now() - timestamp <= timedelta(seconds=self.ttl):
return result
return None
def set_query_cache(self, query: str, result: Dict):
"""Cache query results"""
key = self._generate_key(query)
self.query_cache[key] = (result, datetime.now())
self._save_cache("queries", self.query_cache)
def set_document_cache(self, doc_id: str, document: Any):
"""Cache document"""
self.document_cache[doc_id] = (document, datetime.now())
self._save_cache("documents", self.document_cache)
@dataclass
class ModelConfig:
openai_api_key: str
embedder_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
openai_model: str = "gpt-4o"
temperature: float = 0.7
@dataclass
class RetrieverConfig:
top_k: int = 5
@dataclass
class CacheConfig:
enabled: bool = True
cache_dir: Path = Path("./cache")
ttl: int = 86400 # 24 hours
@dataclass
class ProcessingConfig:
batch_size: int = 32
@dataclass
class LocalizationConfig:
enable_thai_normalization: bool = True
@dataclass
class PipelineConfig:
model: ModelConfig
retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
cache: CacheConfig = field(default_factory=CacheConfig)
processing: ProcessingConfig = field(default_factory=ProcessingConfig)
localization: LocalizationConfig = field(default_factory=LocalizationConfig)
def create_default_config(api_key: str) -> PipelineConfig:
"""
Create a default pipeline configuration with optimized settings for Thai language processing.
Args:
api_key (str): OpenAI API key
Returns:
PipelineConfig: Configured pipeline settings
"""
return PipelineConfig(
model=ModelConfig(
openai_api_key=api_key,
temperature=0.3 # Lower temperature for more focused responses
),
retriever=RetrieverConfig(
top_k=5 # Optimal number of documents to retrieve
),
cache=CacheConfig(
enabled=True,
cache_dir=Path("./cache"),
ttl=86400 # 24 hour cache
),
processing=ProcessingConfig(
batch_size=32 # Default batch size for processing
),
localization=LocalizationConfig(
enable_thai_normalization=True # Enable Thai text normalization
)
)
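# Illustrative override sketch: the factory returns plain dataclasses, so
# per-deployment tweaks are direct attribute assignments.
def _demo_config(api_key: str) -> PipelineConfig:
    config = create_default_config(api_key)
    config.retriever.top_k = 10  # wider retrieval fan-out
    config.cache.ttl = 3600      # shorter, hourly cache
    return config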
class CalendarDataProcessor:
"""Process and structure calendar data from the new raw-data.json format"""
@staticmethod
def parse_calendar_json(json_data: Dict) -> List[CalendarEvent]:
"""Parse the new calendar JSON format into CalendarEvent objects"""
events = []
# Extract academic calendar data - handle direct dictionary input
calendar_data = json_data.get('academic_calendar', []) if isinstance(json_data, dict) else json_data
for semester_block in calendar_data:
semester = semester_block.get('education', '')
schedule = semester_block.get('schedule', [])
# Handle regular schedule events
for event in schedule:
if 'section' in event and 'details' in event:
# Process section-based events (thesis deadlines, etc.)
section = event['section']
for detail in event['details']:
if 'ภาคต้น' in detail and 'ภาคปลาย' in detail:
# Handle dual-semester events
for sem_key in ['ภาคต้น', 'ภาคปลาย']:
if detail.get(sem_key):
events.append(CalendarEvent(
date=detail[sem_key],
time='',
activity=detail['title'],
note=section,
semester=sem_key,
event_type='deadline',
section=section
))
else:
# Single semester event
events.append(CalendarEvent(
date=detail.get('date', ''),
time='',
activity=detail.get('title', ''),
note=section,
semester=ThaiTextPreprocessor.normalize_thai_text(semester),
event_type='deadline',
section=section
))
else:
# Regular calendar event
event_type = CalendarEvent.classify_event_type(event.get('activity', ''))
# Clean semester string
cleaned_semester = semester
if '(' in semester:
match = re.search(r'\((.*?)\)', semester)
if match:
cleaned_semester = match.group(1)
cleaned_semester = ThaiTextPreprocessor.normalize_thai_text(cleaned_semester)
events.append(CalendarEvent(
date=event.get('date', ''),
time=event.get('time', ''),
activity=event.get('activity', ''),
note=event.get('note', ''),
semester=cleaned_semester,
event_type=event_type
))
return events
@staticmethod
    def extract_program_details(json_data: Dict) -> List[ProgramDetailInfo]:
        """Extract and structure program details into a list of ProgramDetailInfo objects"""
raw_details = json_data.get('program_details', {})
# Process application info
app_info_data = raw_details.get('application_info', {})
app_info = ApplicationInfo(
application_portal=app_info_data.get('application_portal', ''),
program_email=app_info_data.get('program_email', '')
)
# Process required documents
req_docs = {}
raw_docs = raw_details.get('required_documents', {})
# Process mandatory documents
mandatory_docs = {}
for doc_key, doc_value in raw_docs.get('mandatory', {}).items():
mandatory_docs[doc_key] = RequiredDocument(
name=doc_key,
description=doc_value
)
req_docs['mandatory'] = mandatory_docs
# Process optional documents
optional_docs = {}
for doc_key, doc_data in raw_docs.get('optional', {}).items():
if doc_key == 'english_proficiency':
ep_data = doc_data
optional_docs[doc_key] = RequiredDocument(
name=ep_data.get('name', ''),
description=str(ep_data.get('accepted_tests', {})),
conditions=f"Validity: {ep_data.get('validity', '')}, Benefits: {ep_data.get('benefits', '')}, Exemptions: {ep_data.get('exemptions', '')}"
)
else:
optional_docs[doc_key] = RequiredDocument(
name=doc_data.get('name', ''),
description='',
conditions=doc_data.get('condition', '')
)
req_docs['optional'] = optional_docs
# Process selection steps
selection_steps = []
for step_data in raw_details.get('selection_process', {}).get('steps', []):
for step_num, description in step_data.items():
selection_steps.append(SelectionStep(
step_number=step_num,
description=description
))
return [ProgramDetailInfo(
application_info=app_info,
required_documents=req_docs,
submission_process=raw_details.get('submission_process', ''),
selection_process=selection_steps
)]
@staticmethod
def extract_contact_details(json_data: Dict) -> List[ContactDetail]:
"""Extract and structure contact details into ContactDetail objects"""
raw_contacts = json_data.get('contact_details', [])
contact_details = []
# Handle the case where raw_contacts might be a single object instead of a list
if not isinstance(raw_contacts, list):
raw_contacts = [raw_contacts]
for contact_data in raw_contacts:
# Skip if contact_data is not a dictionary
if not isinstance(contact_data, dict):
continue
try:
# Process transportation data
transportation_data = contact_data.get('transportation', {})
transportation = Transportation(
boat=transportation_data.get('boat', ''),
bts=transportation_data.get('bts', ''),
mrt=transportation_data.get('mrt', ''),
airport_link=transportation_data.get('airport_link', ''),
bus=transportation_data.get('bus', {})
)
# Process contact information
contact_info = Contact(
email=contact_data.get('email', ''),
facebook=contact_data.get('facebook', {})
)
# Create ContactDetail object
contact_details.append(ContactDetail(
event_type=contact_data.get('event_type', ''),
department=contact_data.get('department', ''),
faculty=contact_data.get('faculty', ''),
university=contact_data.get('university', ''),
location=contact_data.get('location', ''),
contact=contact_info,
transportation=transportation
))
except Exception as e:
print(f"Error processing contact data: {e}")
continue
return contact_details
@staticmethod
def extract_course_structure(json_data: Dict) -> List[CourseStructure]:
"""Extract and structure course information into CourseStructure objects"""
course_structures = []
# Get course structure data
course_data = json_data.get('course_structure', {})
program_metadata = course_data.get('program_metadata', {})
curriculum = course_data.get('curriculum_structure', {})
# Process foundation courses
foundation_data = curriculum.get('foundation_courses', {})
foundation_courses = []
for course in foundation_data.get('courses', []):
foundation_courses.append(Course(
code=course.get('code', ''),
title_th=course.get('title', {}).get('th', ''),
title_en=course.get('title', {}).get('en', ''),
credits=course.get('credits', 0)
))
# Process core courses
core_data = curriculum.get('core_courses', {})
core_courses = []
for course in core_data.get('modules', []):
core_courses.append(Course(
code=course.get('code', ''),
title_th=course.get('title', {}).get('th', ''),
title_en=course.get('title', {}).get('en', ''),
credits=course.get('credits', 0)
))
# Process elective courses
elective_data = curriculum.get('electives', {})
elective_courses = []
for course in elective_data.get('course_groups', []):
elective_courses.append(Course(
code=course.get('code', ''),
title_th=course.get('title', {}).get('th', ''),
title_en=course.get('title', {}).get('en', ''),
credits=course.get('credits', 0)
))
# Process research courses
research_data = curriculum.get('research', {})
research_courses = []
for course in research_data.get('course', []):
research_courses.append(Course(
code=course.get('code', ''),
title_th=course.get('title', {}).get('th', ''),
title_en=course.get('title', {}).get('en', ''),
credits=course.get('credits', 0)
))
# Create course categories
structure = {
'หมวดวิชาปรับพื้นฐาน': CourseCategory( # Previously foundation_courses
description=foundation_data.get('metadata', {}).get('description'),
credits=foundation_data.get('metadata', {}).get('credits', 'non-credit'),
minimum_credits=None,
courses=foundation_courses
),
'หมวดวิชาบังคับ': CourseCategory( # Previously core_courses
description=None,
credits=0,
minimum_credits=core_data.get('minimum_requirement_credits'),
courses=core_courses
),
'หมวดวิชาเลือก': CourseCategory( # Previously elective_courses
description=None,
credits=0,
minimum_credits=elective_data.get('minimum_requirement_credits'),
courses=elective_courses
),
'หมวดวิชาการค้นคว้าอิสระ': CourseCategory( # Previously research_courses
description=None,
credits=0,
minimum_credits=research_data.get('minimum_requirement_credits'),
courses=research_courses
)
}
# Create course structure
course_structure = CourseStructure(
event_type='curriculum_structure',
program_name=program_metadata.get('name', ''),
department=program_metadata.get('department', ''),
total_credits=program_metadata.get('total_credits', 0),
degree_level=program_metadata.get('degree_level', ''),
structure=structure
)
return [course_structure]
@staticmethod
def extract_program_study_plan(json_data: Dict) -> List[StudyPlan]:
"""Extract and structure study plan information into StudyPlan objects"""
study_plan_data = json_data.get('program_study_plan', {})
# Initialize the years dictionary to store all year/semester data
years_dict = {}
for year_key, year_data in study_plan_data.items():
years_dict[year_key] = {}
for semester_key, semester_data in year_data.items():
# Get metadata
metadata = semester_data.get('metadata', {})
# Initialize semester structure
semester_struct = {
'metadata': metadata,
'courses': []
}
# Handle both 'modules' and 'courses' keys
course_data = semester_data.get('modules', []) or semester_data.get('courses', [])
# Add courses to semester
for course in course_data:
course_info = {
'code': course.get('code', ''),
'title': course.get('title', {'th': '', 'en': ''}),
'credits': course.get('credits', 0)
}
semester_struct['courses'].append(course_info)
# Add semester data to year
years_dict[year_key][semester_key] = semester_struct
# Create StudyPlan object
study_plan = StudyPlan(
event_type='study_plan',
years=years_dict
)
return [study_plan]
@staticmethod
def extract_fees(json_data: Dict) -> List[TuitionFee]:
"""Extract and structure fee information into TuitionFee objects"""
fees_data = json_data.get('fees', {})
# Parse regular tuition fee
regular_fee_str = fees_data.get('tuition', '')
        # Drop thousands separators (e.g. "25,000") before float conversion
        regular_amount = float(regular_fee_str.split()[0].replace(',', '')) if regular_fee_str else 0.0
regular_fee = RegularFee(
amount=regular_amount,
currency='THB',
period='per semester'
)
# Parse late payment fee
late_fee_str = fees_data.get('late_payment', '')
        late_amount = float(late_fee_str.split()[0].replace(',', '')) if late_fee_str else 0.0
late_payment_fee = LatePaymentFee(
amount=late_amount,
currency='THB'
)
# Create TuitionFee object
tuition_fee = TuitionFee(
event_type='tuition_fee',
regular_fee=regular_fee,
late_payment_fee=late_payment_fee
)
return [tuition_fee]
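# Illustrative helper: extract_fees reads the leading number of each fee
# string. The payload below is hypothetical but follows the 'fees' schema
# this parser expects.
def _demo_extract_fees() -> None:
    fees = CalendarDataProcessor.extract_fees(
        {"fees": {"tuition": "25,000 บาท ต่อภาคการศึกษา", "late_payment": "500 บาท"}}
    )
    assert fees[0].regular_fee.amount == 25000.0
    assert fees[0].late_payment_fee.amount == 500.0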
class HybridDocumentStore:
"""Enhanced document store with hybrid retrieval capabilities"""
def __init__(self, config: PipelineConfig):
self.store = InMemoryDocumentStore()
self.embedder = SentenceTransformersDocumentEmbedder(
model=config.model.embedder_model
)
# Initialize BM25 retriever
self.bm25_retriever = InMemoryBM25Retriever(
document_store=self.store,
top_k=config.retriever.top_k
)
# Initialize embedding retriever
self.embedding_retriever = InMemoryEmbeddingRetriever(
document_store=self.store,
top_k=config.retriever.top_k
)
        self.cache_manager = CacheManager(
            cache_dir=config.cache.cache_dir,
            ttl=config.cache.ttl
        )
        self.batch_size = config.processing.batch_size
self.embedder.warm_up()
# Initialize containers
self.events = []
self.event_type_index = {}
self.semester_index = {}
self._document_counter = 0
# Additional data containers
self.course_data = []
self.contact_data = []
self.study_plan_data = []
def _generate_unique_id(self) -> str:
"""Generate a unique document ID"""
self._document_counter += 1
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"doc_{timestamp}_{self._document_counter}"
def _compute_embedding(self, text: str) -> Any:
"""Compute embedding with caching"""
cached_embedding = self.cache_manager.get_embedding_cache(text)
if cached_embedding is not None:
return cached_embedding
doc = Document(content=text)
embedding = self.embedder.run(documents=[doc])["documents"][0].embedding
self.cache_manager.set_embedding_cache(text, embedding)
return embedding
def add_document(self, text: str, event_type: str):
"""Add a single document to the store"""
try:
# Compute embedding
embedding = self._compute_embedding(text)
# Create document with unique ID
doc = Document(
id=self._generate_unique_id(),
content=text,
embedding=embedding,
meta={'event_type': event_type}
)
# Write document
self.store.write_documents([doc])
# Cache document
self.cache_manager.set_document_cache(doc.id, doc)
except Exception as e:
logger.error(f"Error adding document: {str(e)}")
raise
def add_events(self, events: List[CalendarEvent], contact_details: Optional[List[ContactDetail]] = None,
course_structure: Optional[List[CourseStructure]] = None,
study_plans: Optional[List[StudyPlan]] = None):
"""Add events and additional data with caching"""
documents = []
added_events = set() # Track added events to prevent duplicates
# Process calendar events
for event in events:
event_key = f"{event.date}_{event.activity}_{event.semester}"
if event_key not in added_events:
added_events.add(event_key)
self.events.append(event)
event_idx = len(self.events) - 1
# Update indices
if event.event_type not in self.event_type_index:
self.event_type_index[event.event_type] = []
self.event_type_index[event.event_type].append(event_idx)
if event.semester not in self.semester_index:
self.semester_index[event.semester] = []
self.semester_index[event.semester].append(event_idx)
# Create document
text = event.to_searchable_text()
embedding = self._compute_embedding(text)
doc = Document(
id=self._generate_unique_id(),
content=text,
embedding=embedding,
meta={
'event_type': event.event_type,
'semester': event.semester,
'date': event.date,
'event_idx': event_idx
}
)
documents.append(doc)
self.cache_manager.set_document_cache(str(event_idx), doc)
# Process contact details
if contact_details:
for contact in contact_details:
self.contact_data.append(contact)
text = f"""
ข้อมูลการติดต่อ:
คณะ: {contact.faculty}
ภาควิชา: {contact.department}
มหาวิทยาลัย: {contact.university}
สถานที่: {contact.location}
การติดต่อ:
อีเมล: {contact.contact.email}
Facebook: {json.dumps(contact.contact.facebook, ensure_ascii=False)}
การเดินทาง:
เรือ: {contact.transportation.boat}
BTS: {contact.transportation.bts}
MRT: {contact.transportation.mrt}
Airport Link: {contact.transportation.airport_link}
รถประจำทาง: {json.dumps(contact.transportation.bus, ensure_ascii=False)}
"""
embedding = self._compute_embedding(text)
doc = Document(
id=self._generate_unique_id(),
content=text,
embedding=embedding,
meta={'event_type': 'contact'}
)
documents.append(doc)
# Process course structure
if course_structure:
for course in course_structure:
self.course_data.append(course)
text = f"""
โครงสร้างหลักสูตร:
ชื่อหลักสูตร: {course.program_name}
ภาควิชา: {course.department}
หน่วยกิตรวม: {course.total_credits}
ระดับการศึกษา: {course.degree_level}
รายละเอียดโครงสร้าง:
"""
for category_name, category in course.structure.items():
text += f"\n{category_name}:\n"
if category.description:
text += f"คำอธิบาย: {category.description}\n"
text += f"หน่วยกิต: {category.credits}\n"
if category.minimum_credits:
text += f"หน่วยกิตขั้นต่ำ: {category.minimum_credits}\n"
text += "รายวิชา:\n"
for course_item in category.courses:
text += f"- {course_item.code}: {course_item.title_th} ({course_item.title_en}) - {course_item.credits} หน่วยกิต\n"
embedding = self._compute_embedding(text)
doc = Document(
id=self._generate_unique_id(),
content=text,
embedding=embedding,
meta={'event_type': 'curriculum'}
)
documents.append(doc)
# Process study plans
if study_plans:
for plan in study_plans:
self.study_plan_data.append(plan)
text = "แผนการศึกษา:\n"
for year, semesters in plan.years.items():
text += f"\nปีที่ {year}:\n"
for semester, data in semesters.items():
text += f"\n{semester}:\n"
if 'metadata' in data and data['metadata']:
text += f"ข้อมูลเพิ่มเติม: {json.dumps(data['metadata'], ensure_ascii=False)}\n"
if 'courses' in data:
for course in data['courses']:
text += f"- {course['code']}: {course['title'].get('th', '')} ({course['title'].get('en', '')}) - {course['credits']} หน่วยกิต\n"
embedding = self._compute_embedding(text)
doc = Document(
id=self._generate_unique_id(),
content=text,
embedding=embedding,
meta={'event_type': 'study_plan'}
)
documents.append(doc)
        batch_size = self.batch_size  # from ProcessingConfig, not a magic number
for i in range(0, len(documents), batch_size):
batch = documents[i:i + batch_size]
try:
self.store.write_documents(batch)
except Exception as e:
logger.error(f"Error writing document batch {i//batch_size + 1}: {str(e)}")
for doc in batch:
try:
self.store.write_documents([doc])
except Exception as e2:
logger.error(f"Failed to write document {doc.id}: {str(e2)}")
def hybrid_search(self,
query: str,
event_type: Optional[str] = None,
semester: Optional[str] = None,
top_k: int = 10,
weight_semantic: float = 0.5) -> List[Document]:
"""Hybrid search combining semantic and lexical search results"""
cache_key = json.dumps({
'query': query,
'event_type': event_type,
'semester': semester,
'top_k': top_k,
'weight_semantic': weight_semantic
})
cached_results = self.cache_manager.get_query_cache(cache_key)
if cached_results is not None:
return cached_results
# Get semantic search results
query_embedding = self._compute_embedding(query)
semantic_results = self.embedding_retriever.run(
query_embedding=query_embedding
)["documents"]
# Get BM25 results
bm25_results = self.bm25_retriever.run(
query=query
)["documents"]
# Combine results using score fusion
combined_results = self._merge_results(
semantic_results=semantic_results,
bm25_results=bm25_results,
weight_semantic=weight_semantic,
top_k=top_k
)
# Filter results based on metadata
filtered_results = []
for doc in combined_results:
if event_type and doc.meta.get('event_type') != event_type:
continue
if semester and doc.meta.get('semester') != semester:
continue
filtered_results.append(doc)
final_results = filtered_results[:top_k]
self.cache_manager.set_query_cache(cache_key, final_results)
return final_results
def _merge_results(self,
semantic_results: List[Document],
bm25_results: List[Document],
weight_semantic: float,
top_k: int) -> List[Document]:
"""Merge semantic and BM25 results using weighted score fusion"""
# Create dictionaries to store normalized scores
semantic_scores = {}
bm25_scores = {}
# Normalize semantic scores
max_semantic_score = max(doc.score for doc in semantic_results) if semantic_results else 1.0
for doc in semantic_results:
semantic_scores[doc.id] = doc.score / max_semantic_score if max_semantic_score > 0 else 0
# Normalize BM25 scores
max_bm25_score = max(doc.score for doc in bm25_results) if bm25_results else 1.0
for doc in bm25_results:
bm25_scores[doc.id] = doc.score / max_bm25_score if max_bm25_score > 0 else 0
# Combine scores
combined_scores = {}
all_docs = {doc.id: doc for doc in semantic_results + bm25_results}
for doc_id in all_docs:
semantic_score = semantic_scores.get(doc_id, 0)
bm25_score = bm25_scores.get(doc_id, 0)
# Weighted combination
combined_scores[doc_id] = (
weight_semantic * semantic_score +
(1 - weight_semantic) * bm25_score
)
# Sort by combined score and return top_k results
sorted_docs = sorted(
all_docs.values(),
key=lambda x: combined_scores[x.id],
reverse=True
)
return sorted_docs[:top_k]
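# Worked example of the fusion above, with weight_semantic = 0.3: a document
# normalized to 0.8 semantically and 1.0 on BM25 scores
# 0.3 * 0.8 + 0.7 * 1.0 = 0.94, so it outranks a purely semantic hit
# (1.0 semantic, absent from BM25) at 0.3 * 1.0 + 0.7 * 0.0 = 0.30.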
class AdvancedQueryProcessor:
"""Process queries with better understanding"""
def __init__(self, config: PipelineConfig):
        self.generator = OpenAIGenerator(
            api_key=Secret.from_token(config.model.openai_api_key),
            model=config.model.openai_model,
            generation_kwargs={"temperature": config.model.temperature}
        )
self.prompt_builder = PromptBuilder(
template="""
วิเคราะห์คำถามที่เกี่ยวข้องกับปฏิทินการศึกษา (ภาษาไทย):
คำถาม: {{query}}
ระบุ:
1. ประเภทของข้อมูลที่ต้องการค้นหา
2. ภาคการศึกษาที่ระบุไว้ (ถ้ามี)
3. คำสำคัญที่เกี่ยวข้อง
ให้ผลลัพธ์ในรูปแบบ JSON:
{
"event_type": "ลงทะเบียน|กำหนดเวลา|การสอบ|วิชาการ|วันหยุด",
"semester": "ภาคการศึกษาที่ระบุ หรือ null",
"key_terms": ["คำสำคัญ 3 คำที่สำคัญที่สุด"],
"response_format": "รายการ|คำตอบเดียว|คำตอบละเอียด"
}
"""
)
def _get_default_analysis(self, query: str) -> Dict[str, Any]:
"""Return default analysis when processing fails"""
logger.info("Returning default analysis")
return {
"original_query": query,
"event_type": None,
"semester": None,
"key_terms": [],
"response_format": "detailed"
}
def process_query(self, query: str) -> Dict[str, Any]:
"""Enhanced query processing with better error handling."""
try:
result = self.prompt_builder.run(query=query)
response = self.generator.run(prompt=result["prompt"])
if not response or not response.get("replies") or not response["replies"][0]:
logger.warning("Received empty response from OpenAI")
return self._get_default_analysis(query)
try:
analysis = json.loads(response["replies"][0])
            except json.JSONDecodeError:
                logger.warning("Could not parse query analysis as JSON; using default analysis")
                return self._get_default_analysis(query)
            # Ensure course-related queries retrieve study plans and curricula
course_keywords = ['หน่วยกิต', 'วิชา', 'หลักสูตร', 'แผนการเรียน', 'วิชาเลือก', 'วิชาบังคับ', 'วิชาการค้นคว้า', 'วิชาหลัก']
if any(keyword in query for keyword in course_keywords):
analysis['event_type'] = 'curriculum'
            # Ensure fee-related queries retrieve tuition fee documents
fee_keywords = ['ค่าเทอม', 'ค่าธรรมเนียม', 'ค่าเรียน', 'ค่าปรับ']
if any(keyword in query for keyword in fee_keywords):
analysis['event_type'] = 'fees'
return {
"original_query": query,
**analysis
}
except Exception as e:
logger.error(f"Query processing failed: {str(e)}")
return self._get_default_analysis(query)
class ResponseGenerator:
"""Generate responses with better context utilization"""
def __init__(self, config: PipelineConfig):
        self.generator = OpenAIGenerator(
            api_key=Secret.from_token(config.model.openai_api_key),
            model=config.model.openai_model,
            generation_kwargs={"temperature": config.model.temperature}
        )
self.prompt_builder = PromptBuilder(
template="""
คุณเป็นที่ปรึกษาทางวิชาการ กรุณาตอบคำถามต่อไปนี้โดยใช้ข้อมูลจากปฏิทินการศึกษาที่ให้มา
คำถาม: {{query}}
ข้อมูลที่เกี่ยวข้องจากปฏิทินการศึกษา:
{% for doc in context %}
---
{{doc.content}}
{% endfor %}
**ห้ามเดาข้อมูลเอง ถ้าไม่มีข้อมูลให้ตอบว่า "ไม่มีข้อมูลที่ตรงกับคำถาม"**
กรุณาตอบเป็นภาษาไทย:
ต้องบอกเสมอว่า **หากมีข้อสงสัยเพิ่มเติมสามารถสอบถามได้**
"""
)
def generate_response(self,
query: str,
documents: List[Document],
query_info: Dict[str, Any]) -> str:
"""Generate response using retrieved documents"""
try:
            result = self.prompt_builder.run(
                query=query,
                context=documents,
                format=query_info.get("response_format", "detailed")
            )
response = self.generator.run(prompt=result["prompt"])
return response["replies"][0]
except Exception as e:
logger.error(f"Response generation failed: {str(e)}")
return "ขออภัย ไม่สามารถประมวลผลคำตอบได้ในขณะนี้"
class AcademicCalendarRAG:
"""Enhanced RAG system for academic calendar and program information"""
def __init__(self, config: PipelineConfig):
self.config = config
self.document_store = HybridDocumentStore(config) # Use the new hybrid store
self.query_processor = AdvancedQueryProcessor(config)
self.response_generator = ResponseGenerator(config)
self.data_processor = CalendarDataProcessor()
# Initialize data containers
self.calendar_events = []
self.program_details = []
self.contact_details = []
self.course_structure = []
self.study_plans = []
self.tuition_fees = []
def load_data(self, json_data: Dict):
"""Load and process all data sources"""
try:
raw_events = self.data_processor.parse_calendar_json(json_data)
for event in raw_events:
if not event.event_type:
event.event_type = CalendarEvent.classify_event_type(event.activity)
self.calendar_events.append(event)
# Process other data types
self.program_details = self.data_processor.extract_program_details(json_data)
self.contact_details = self.data_processor.extract_contact_details(json_data)
self.course_structure = self.data_processor.extract_course_structure(json_data)
self.study_plans = self.data_processor.extract_program_study_plan(json_data)
self.tuition_fees = self.data_processor.extract_fees(json_data)
self._add_calendar_events()
self._add_program_info()
except Exception as e:
logger.error(f"Error loading data: {str(e)}")
raise
def _add_calendar_events(self):
"""Add calendar events and other data to document store"""
if self.calendar_events:
self.document_store.add_events(
events=self.calendar_events,
contact_details=self.contact_details,
course_structure=self.course_structure,
study_plans=self.study_plans
)
def _add_program_info(self):
"""Enhanced method to add program-related information to document store"""
if self.program_details:
for detail in self.program_details:
text = f"""
ข้อมูลการสมัคร:
เว็บไซต์รับสมัคร: {detail.application_info.application_portal}
อีเมล: {detail.application_info.program_email}
เอกสารที่ต้องใช้:
{self._format_required_docs(detail.required_documents)}
ขั้นตอนการส่งเอกสาร:
{detail.submission_process}
ขั้นตอนการคัดเลือก:
{self._format_selection_steps(detail.selection_process)}
"""
self.document_store.add_document(text, "program_details")
if self.tuition_fees:
for fee in self.tuition_fees:
text = f"""
ค่าธรรมเนียมการศึกษา:
ค่าเล่าเรียนปกติ: {fee.regular_fee.amount:,.2f} {fee.regular_fee.currency} {fee.regular_fee.period}
ค่าปรับชำระล่าช้า: {fee.late_payment_fee.amount:,.2f} {fee.late_payment_fee.currency}
"""
self.document_store.add_document(text, "fees")
def _format_required_docs(self, docs: Dict) -> str:
"""Format required documents information with detailed English proficiency requirements"""
result = []
if 'mandatory' in docs:
result.append("เอกสารที่ต้องใช้:")
for doc in docs['mandatory'].values():
result.append(f"- {doc.name}: {doc.description}")
if 'optional' in docs:
result.append("\nเอกสารเพิ่มเติม:")
for doc_key, doc in docs['optional'].items():
if doc_key == 'english_proficiency':
result.append(f"- {doc.name}")
                    # Parse and format the accepted tests; the description was
                    # serialized with str() on a dict, so literal_eval is both
                    # sufficient and safe (unlike eval on arbitrary text)
                    try:
                        accepted_tests = ast.literal_eval(doc.description)
                        result.append(" เกณฑ์คะแนนที่ยอมรับ:")
                        for test, requirement in accepted_tests.items():
                            result.append(f" * {test}: {requirement}")
                    except (ValueError, SyntaxError):
                        result.append(f" {doc.description}")
if doc.conditions:
conditions = doc.conditions.split(', ')
for condition in conditions:
result.append(f" {condition}")
else:
desc = f"- {doc.name}"
if doc.conditions:
desc += f" ({doc.conditions})"
result.append(desc)
return "\n".join(result)
def _format_selection_steps(self, steps: List[SelectionStep]) -> str:
"""Format selection process steps"""
return "\n".join(f"{step.step_number}. {step.description}" for step in steps)
def _get_fee_documents(self) -> List[Document]:
"""Get fee-related documents"""
if not self.tuition_fees:
return []
documents = []
for fee in self.tuition_fees:
text = f"""
ค่าธรรมเนียมการศึกษา:
- ค่าเล่าเรียน: {fee.regular_fee.amount:,.2f} {fee.regular_fee.currency} {fee.regular_fee.period}
- ค่าปรับชำระล่าช้า: {fee.late_payment_fee.amount:,.2f} {fee.late_payment_fee.currency}
"""
doc = Document(
content=text,
meta={"event_type": "fees"}
)
documents.append(doc)
return documents
    def process_query(self, query: str, weight_semantic: float = 0.3) -> Dict[str, Any]:
        """Process user query using hybrid retrieval; the 0.3 default favors BM25"""
        try:
            # Analyze query
            query_info = self.query_processor.process_query(query)
            # Get relevant documents using hybrid search, honoring the
            # caller's weight instead of a hard-coded constant
            documents = self.document_store.hybrid_search(
                query=query,
                event_type=query_info.get("event_type"),
                semester=query_info.get("semester"),
                top_k=self.config.retriever.top_k,
                weight_semantic=weight_semantic
            )
# Add fee information for fee-related queries
if query_info.get("event_type") == "fees" and self.tuition_fees:
fee_docs = self._get_fee_documents()
documents.extend(fee_docs)
# Generate response
response = self.response_generator.generate_response(
query=query,
documents=documents,
query_info=query_info
)
return {
"query": query,
"answer": response,
"relevant_docs": documents,
"query_info": query_info
}
except Exception as e:
logger.error(f"Error processing query: {str(e)}")
return {
"query": query,
"answer": "ขออภัย ไม่สามารถประมวลผลคำตอบได้ในขณะนี้",
"error": str(e)
}
# def main():
# """Main function demonstrating hybrid retrieval"""
# try:
# # Load API key
# with open("key.txt", "r") as f:
# openai_api_key = f.read().strip()
# # Create config with hybrid retrieval settings
# config = create_default_config(openai_api_key)
# pipeline = AcademicCalendarRAG(config)
# # Load and process data
# with open("raw-data.json", "r", encoding="utf-8") as f:
# raw_data = json.load(f)
# pipeline.load_data(raw_data)
# # Test queries with different semantic weights
# queries = ["มีวิชาหลักหรือวิชาเลือกอะไรบ้าง"]
# print("=" * 80)
# for query in queries:
# print(f"\nQuery: {query}")
# result = pipeline.process_query(query)
# print(f"Answer: {result['answer']}")
# print("-" * 40)
# except Exception as e:
# logger.error(f"Pipeline execution failed: {str(e)}")
# raise
# if __name__ == "__main__":
# main()