FauziIsyrinApridal committed
Commit 22f049b · Parent(s): 886eee7

remove supabase parameter from get timestamp; evaluate is still asking for billing, though

Files changed (3):
  1. app.py +3 -3
  2. evaluate.py +126 -339
  3. rag_evaluation_20250627_133749.log +0 -0
app.py CHANGED
@@ -26,9 +26,9 @@ VECTOR_STORE_PREFIX = "vector_store"
 # ---------------------------------------------------------
 # ⚡️ UTILITY
 # ---------------------------------------------------------
-def get_latest_data_timestamp_from_files(bucket_name: str, supabase) -> float:
+def get_latest_data_timestamp_from_files(bucket_name: str) -> float:
     """Get the latest timestamp from files in a Supabase storage bucket."""
-    files = list_all_files(bucket_name, supabase=supabase)
+    files = list_all_files(bucket_name)
     latest_time = 0.0
     for file in files:
         iso_time = file.get("updated_at") or file.get("created_at")
@@ -65,7 +65,7 @@ def vector_store_is_outdated() -> bool:
     if supabase_timestamp is None:
         return True
     supabase_time = datetime.fromisoformat(supabase_timestamp.replace("Z", "+00:00")).timestamp()
-    data_time = get_latest_data_timestamp_from_files("pnp-bot-storage", supabase)
+    data_time = get_latest_data_timestamp_from_files("pnp-bot-storage")
 
     return data_time > supabase_time
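
Note: after this change, list_all_files resolves the Supabase client itself (presumably the shared client in app.db) instead of taking it as an argument. The hunk above truncates the loop body, so the following is a hedged sketch only — the parse-and-compare step is an assumption based on how vector_store_is_outdated() handles the same ISO format:

    from datetime import datetime

    def get_latest_data_timestamp_from_files(bucket_name: str) -> float:
        """Get the latest timestamp from files in a Supabase storage bucket."""
        files = list_all_files(bucket_name)  # no supabase parameter anymore
        latest_time = 0.0
        for file in files:
            iso_time = file.get("updated_at") or file.get("created_at")
            if iso_time:
                # Supabase returns ISO-8601 with a trailing "Z"; fromisoformat needs an explicit offset
                ts = datetime.fromisoformat(iso_time.replace("Z", "+00:00")).timestamp()
                latest_time = max(latest_time, ts)
        return latest_time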
evaluate.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import sys
 import time
 import random
 import logging
@@ -21,28 +22,32 @@ from app.document_processor import load_vector_store_from_supabase
 from app.prompts import sahabat_prompt
 from app.db import supabase
 
-# Setup logging
+# === Logging UTF-8 Safe ===
+class UTF8StreamHandler(logging.StreamHandler):
+    def __init__(self, stream=None):
+        if stream is None:
+            stream = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1)
+        super().__init__(stream)
+
+log_filename = f'rag_evaluation_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
     handlers=[
-        logging.FileHandler(f'rag_evaluation_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
-        logging.StreamHandler()
+        logging.FileHandler(log_filename, encoding='utf-8'),
+        UTF8StreamHandler()
     ]
 )
 logger = logging.getLogger(__name__)
 
+# === Konfigurasi ===
 load_dotenv()
-
-# Konfigurasi
 BUCKET_NAME = "pnp-bot-storage-archive"
 VECTOR_STORE_PREFIX = "vector_store"
-
-# Rate limiting settings
 MAX_CALLS_PER_MINUTE = 50
 MAX_CALLS_PER_HOUR = 1000
 
-# Dataset evaluasi untuk Politeknik Negeri Padang
+# === Dataset evaluasi ===
 evaluation_dataset = [
     {
         'question': '''Bagaimana sistem pendidikan yang diterapkan di Politeknik Negeri Padang?''',
@@ -90,6 +95,7 @@ evaluation_dataset = [
     }
 ]
 
+
 # Schema untuk evaluasi
 class CorrectnessGrade(TypedDict):
     explanation: Annotated[str, ..., "Penjelasan alasan penilaian"]
@@ -170,7 +176,7 @@ Nilai relevansi False berarti FAKTA sama sekali tidak terkait dengan PERTANYAAN.
 Jelaskan penalaran Anda secara bertahap untuk memastikan penalaran dan kesimpulan benar.
 Hindari menyebutkan jawaban benar di awal."""
 
-# Inisialisasi evaluator LLM dengan retry dan rate limiting
+# === Evaluator ===
 class SafeLLMEvaluator:
     def __init__(self, model_name="gpt-4o", temperature=0):
         self.model_name = model_name
@@ -178,350 +184,131 @@ class SafeLLMEvaluator:
         self._init_llms()
 
     def _init_llms(self):
-        """Initialize LLM evaluators with structured output"""
-        try:
-            self.grader_llm = ChatOpenAI(
-                model=self.model_name,
-                temperature=self.temperature
-            ).with_structured_output(CorrectnessGrade, method="json_schema", strict=True)
-
-            self.relevance_llm = ChatOpenAI(
-                model=self.model_name,
-                temperature=self.temperature
-            ).with_structured_output(RelevanceGrade, method="json_schema", strict=True)
-
-            self.grounded_llm = ChatOpenAI(
-                model=self.model_name,
-                temperature=self.temperature
-            ).with_structured_output(GroundedGrade, method="json_schema", strict=True)
-
-            self.retrieval_relevance_llm = ChatOpenAI(
-                model=self.model_name,
-                temperature=self.temperature
-            ).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True)
-
-            logger.info(f"✅ LLM evaluators initialized with model: {self.model_name}")
-
-        except Exception as e:
-            logger.error(f"❌ Failed to initialize LLM evaluators: {e}")
-            raise
+        self.grader_llm = ChatOpenAI(model=self.model_name, temperature=self.temperature).with_structured_output(CorrectnessGrade, method="json_schema", strict=True)
+        self.relevance_llm = ChatOpenAI(model=self.model_name, temperature=self.temperature).with_structured_output(RelevanceGrade, method="json_schema", strict=True)
+        self.grounded_llm = ChatOpenAI(model=self.model_name, temperature=self.temperature).with_structured_output(GroundedGrade, method="json_schema", strict=True)
+        self.retrieval_relevance_llm = ChatOpenAI(model=self.model_name, temperature=self.temperature).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True)
+        logger.info(f"✅ LLM evaluators initialized with model: {self.model_name}")
 
-# Global evaluator instance
 evaluator = SafeLLMEvaluator()
 
+# === Rate Limiting & Retry ===
 @sleep_and_retry
 @limits(calls=MAX_CALLS_PER_MINUTE, period=60)
-@backoff.on_exception(
-    backoff.expo,
-    (Exception,),
-    max_tries=3,
-    max_time=30,
-    jitter=backoff.random_jitter
-)
+@backoff.on_exception(backoff.expo, (Exception,), max_tries=3)
 def safe_api_call(llm, messages):
-    """Safely make API calls with rate limiting and retry"""
-    try:
-        response = llm.invoke(messages)
-        logger.debug(f"✅ API call successful")
-        return response
-    except Exception as e:
-        logger.warning(f"⚠️ API call failed: {e}")
-        raise
+    return llm.invoke(messages)
 
+# === RAG Chain ===
 @traceable(name="Create RAG Chain for Evaluation")
 def create_rag_chain(vector_store):
-    """Membuat RAG chain untuk evaluasi dengan optimasi"""
-    try:
-        llm = Replicate(
-            model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
-            model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 4000}  # Reduced tokens
-        )
-
-        memory = ConversationBufferMemory(
-            memory_key="chat_history",
-            return_messages=True,
-            output_key='answer'
-        )
-
-        # Reduced retrieval count to minimize API calls
-        chain = ConversationalRetrievalChain.from_llm(
-            llm,
-            retriever=vector_store.as_retriever(search_kwargs={"k": 4}),  # Reduced from 6 to 4
-            combine_docs_chain_kwargs={"prompt": sahabat_prompt},
-            return_source_documents=True,
-            memory=memory
-        )
-
-        logger.info("✅ RAG chain created successfully")
-        return chain
-
-    except Exception as e:
-        logger.error(f"❌ Failed to create RAG chain: {e}")
-        raise
+    llm = Replicate(
+        model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
+        model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 4000},
+        replicate_api_token=os.getenv("REPLICATE_API_TOKEN"),
+    )
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')
+    chain = ConversationalRetrievalChain.from_llm(
+        llm, retriever=vector_store.as_retriever(search_kwargs={"k": 4}),
+        combine_docs_chain_kwargs={"prompt": sahabat_prompt},
+        return_source_documents=True, memory=memory
+    )
+    return chain
 
 @traceable(name="RAG Bot Answer")
-@backoff.on_exception(backoff.expo, Exception, max_tries=3)
 def rag_bot_answer(question: str, vector_store) -> dict:
-    """Fungsi untuk mendapatkan jawaban dari RAG bot dengan error handling"""
-    try:
-        chain = create_rag_chain(vector_store)
-        result = chain({"question": question})
-
-        logger.info(f"✅ RAG answer generated for question: {question[:50]}...")
-        return {
-            "answer": result['answer'],
-            "documents": result.get('source_documents', [])
-        }
-    except Exception as e:
-        logger.error(f"❌ Error in rag_bot_answer: {e}")
-        return {
-            "answer": "Terjadi kesalahan dalam memproses pertanyaan.",
-            "documents": []
-        }
-
-# Enhanced evaluator functions with rate limiting
-def correctness_evaluator(question: str, answer: str, ground_truth: str) -> tuple[bool, str]:
-    """Evaluator untuk kebenaran jawaban dengan error handling"""
-    try:
-        answers = f"""PERTANYAAN: {question}
-        JAWABAN BENAR: {ground_truth}
-        JAWABAN SISWA: {answer}"""
-
-        messages = [
-            {"role": "system", "content": correctness_instructions},
-            {"role": "user", "content": answers}
-        ]
-
-        grade = safe_api_call(evaluator.grader_llm, messages)
-        logger.debug(f"✅ Correctness evaluation completed")
-        return grade["correct"], grade["explanation"]
-
-    except Exception as e:
-        logger.error(f"❌ Correctness evaluation failed: {e}")
-        return False, f"Error in evaluation: {str(e)}"
-
-def relevance_evaluator(question: str, answer: str) -> tuple[bool, str]:
-    """Evaluator untuk relevansi jawaban dengan error handling"""
-    try:
-        content = f"PERTANYAAN: {question}\nJAWABAN SISWA: {answer}"
-        messages = [
-            {"role": "system", "content": relevance_instructions},
-            {"role": "user", "content": content}
-        ]
-
-        grade = safe_api_call(evaluator.relevance_llm, messages)
-        logger.debug(f"✅ Relevance evaluation completed")
-        return grade["relevant"], grade["explanation"]
-
-    except Exception as e:
-        logger.error(f"❌ Relevance evaluation failed: {e}")
-        return False, f"Error in evaluation: {str(e)}"
-
-def groundedness_evaluator(answer: str, documents) -> tuple[bool, str]:
-    """Evaluator untuk groundedness jawaban dengan error handling"""
-    try:
-        if not documents:
-            return False, "No documents provided for grounding evaluation"
-
-        doc_string = "\n\n".join([doc.page_content for doc in documents])
-        content = f"FAKTA: {doc_string}\nJAWABAN SISWA: {answer}"
-        messages = [
-            {"role": "system", "content": grounded_instructions},
-            {"role": "user", "content": content}
-        ]
-
-        grade = safe_api_call(evaluator.grounded_llm, messages)
-        logger.debug(f"✅ Groundedness evaluation completed")
-        return grade["grounded"], grade["explanation"]
-
-    except Exception as e:
-        logger.error(f"❌ Groundedness evaluation failed: {e}")
-        return False, f"Error in evaluation: {str(e)}"
-
-def retrieval_relevance_evaluator(question: str, documents) -> tuple[bool, str]:
-    """Evaluator untuk relevansi retrieval dengan error handling"""
-    try:
-        if not documents:
-            return False, "No documents provided for retrieval evaluation"
-
-        doc_string = "\n\n".join([doc.page_content for doc in documents])
-        content = f"FAKTA: {doc_string}\nPERTANYAAN: {question}"
-        messages = [
-            {"role": "system", "content": retrieval_relevance_instructions},
-            {"role": "user", "content": content}
-        ]
-
-        grade = safe_api_call(evaluator.retrieval_relevance_llm, messages)
-        logger.debug(f"✅ Retrieval relevance evaluation completed")
-        return grade["relevant"], grade["explanation"]
-
-    except Exception as e:
-        logger.error(f"❌ Retrieval relevance evaluation failed: {e}")
-        return False, f"Error in evaluation: {str(e)}"
-
-def controlled_delay(min_delay=2, max_delay=5):
-    """Add controlled delay to avoid rate limits"""
-    delay = random.uniform(min_delay, max_delay)
-    logger.debug(f"⏳ Waiting {delay:.2f} seconds...")
-    time.sleep(delay)
-
+    chain = create_rag_chain(vector_store)
+    result = chain({"question": question})
+    return {"answer": result['answer'], "documents": result.get('source_documents', [])}
+
+# === Evaluator Functions ===
+def correctness_evaluator(question, answer, ground_truth):
+    messages = [{"role": "system", "content": correctness_instructions},
+                {"role": "user", "content": f"PERTANYAAN: {question}\nJAWABAN BENAR: {ground_truth}\nJAWABAN SISWA: {answer}"}]
+    grade = safe_api_call(evaluator.grader_llm, messages)
+    return grade["correct"], grade["explanation"]
+
+def relevance_evaluator(question, answer):
+    messages = [{"role": "system", "content": relevance_instructions},
+                {"role": "user", "content": f"PERTANYAAN: {question}\nJAWABAN SISWA: {answer}"}]
+    grade = safe_api_call(evaluator.relevance_llm, messages)
+    return grade["relevant"], grade["explanation"]
+
+def groundedness_evaluator(answer, documents):
+    doc_string = "\n\n".join([doc.page_content for doc in documents])
+    messages = [{"role": "system", "content": grounded_instructions},
+                {"role": "user", "content": f"FAKTA: {doc_string}\nJAWABAN SISWA: {answer}"}]
+    grade = safe_api_call(evaluator.grounded_llm, messages)
+    return grade["grounded"], grade["explanation"]
+
+def retrieval_relevance_evaluator(question, documents):
+    doc_string = "\n\n".join([doc.page_content for doc in documents])
+    messages = [{"role": "system", "content": retrieval_relevance_instructions},
+                {"role": "user", "content": f"FAKTA: {doc_string}\nPERTANYAAN: {question}"}]
+    grade = safe_api_call(evaluator.retrieval_relevance_llm, messages)
+    return grade["relevant"], grade["explanation"]
+
+# === Delay helper ===
+def controlled_delay(min_delay=1, max_delay=3):
+    time.sleep(random.uniform(min_delay, max_delay))
+
+# === Evaluation Runner ===
 @traceable(name="Run RAG Evaluation Enhanced")
-def run_enhanced_evaluation(batch_size: int = None, start_index: int = 0):
-    """Menjalankan evaluasi RAG dengan optimasi dan monitoring"""
-    logger.info("🚀 Memulai evaluasi RAG Enhanced untuk Politeknik Negeri Padang...")
-
-    # Load vector store
-    logger.info("📚 Memuat vector store dari Supabase...")
-    try:
-        vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_PREFIX)
-        if not vector_store:
-            logger.error("❌ Gagal memuat vector store!")
-            return None
-        logger.info("✅ Vector store berhasil dimuat!")
-    except Exception as e:
-        logger.error(f"❌ Error loading vector store: {e}")
-        return None
-
-    # Determine evaluation scope
-    if batch_size:
-        end_index = min(start_index + batch_size, len(evaluation_dataset))
-        dataset_subset = evaluation_dataset[start_index:end_index]
-        logger.info(f"📊 Evaluating batch {start_index}-{end_index-1} ({len(dataset_subset)} questions)")
-    else:
-        dataset_subset = evaluation_dataset
-        logger.info(f"📊 Evaluating all {len(dataset_subset)} questions")
-
-    # Hasil evaluasi
+def run_enhanced_evaluation():
+    logger.info("🚀 Starting evaluation...")
+    vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_PREFIX)
     results = []
-    total_questions = len(dataset_subset)
-    start_time = time.time()
-
-    # Progress tracking
-    success_count = 0
-    error_count = 0
-
-    for i, item in enumerate(dataset_subset, 1):
-        question_start_time = time.time()
-        logger.info(f"\n🔍 Evaluasi pertanyaan {i}/{total_questions}")
-
-        question = item['question']
-        ground_truth = item['ground_truth']
-
+
+    for idx, item in enumerate(evaluation_dataset, 1):
+        question = item["question"]
+        ground_truth = item["ground_truth"]
+
         try:
-            # Dapatkan jawaban dari RAG
-            logger.info(f"🤖 Getting RAG answer...")
             rag_result = rag_bot_answer(question, vector_store)
-            answer = rag_result['answer']
-            documents = rag_result['documents']
-
-            logger.info(f"❓ Pertanyaan: {question[:100]}...")
-            logger.info(f"💬 Jawaban: {answer[:100]}...")
-            logger.info(f"📄 Dokumen ditemukan: {len(documents)}")
-
-            # Add delay before evaluations
-            controlled_delay(1, 3)
-
-            # Evaluasi dengan error handling
-            logger.info("📈 Running evaluations...")
-
-            correctness_score, correctness_explanation = correctness_evaluator(question, answer, ground_truth)
-            controlled_delay(1, 2)
-
-            relevance_score, relevance_explanation = relevance_evaluator(question, answer)
-            controlled_delay(1, 2)
-
-            groundedness_score, groundedness_explanation = groundedness_evaluator(answer, documents)
-            controlled_delay(1, 2)
-
-            retrieval_relevance_score, retrieval_explanation = retrieval_relevance_evaluator(question, documents)
-
-            result = {
-                'question_index': start_index + i,
-                'question': question,
-                'answer': answer,
-                'ground_truth': ground_truth,
-                'documents_count': len(documents),
-                'correctness': correctness_score,
-                'correctness_explanation': correctness_explanation,
-                'relevance': relevance_score,
-                'relevance_explanation': relevance_explanation,
-                'groundedness': groundedness_score,
-                'groundedness_explanation': groundedness_explanation,
-                'retrieval_relevance': retrieval_relevance_score,
-                'retrieval_explanation': retrieval_explanation,
-                'processing_time': time.time() - question_start_time
-            }
-
-            results.append(result)
-            success_count += 1
-
-            logger.info(f"📈 Skor - Benar: {correctness_score}, Relevan: {relevance_score}, "
-                        f"Berdasarkan Dokumen: {groundedness_score}, Retrieval Relevan: {retrieval_relevance_score}")
-            logger.info(f"⏱️ Waktu pemrosesan: {result['processing_time']:.2f} detik")
-
+            answer = rag_result["answer"]
+            documents = rag_result["documents"]
+
+            correctness, correctness_exp = correctness_evaluator(question, answer, ground_truth)
+            relevance, relevance_exp = relevance_evaluator(question, answer)
+            grounded, grounded_exp = groundedness_evaluator(answer, documents)
+            retrieval, retrieval_exp = retrieval_relevance_evaluator(question, documents)
+
+            results.append({
+                "question": question,
+                "answer": answer,
+                "correctness": correctness,
+                "correctness_explanation": correctness_exp,
+                "relevance": relevance,
+                "relevance_explanation": relevance_exp,
+                "groundedness": grounded,
+                "groundedness_explanation": grounded_exp,
+                "retrieval_relevance": retrieval,
+                "retrieval_explanation": retrieval_exp,
+            })
+
+            logger.info(f"[{idx}] ✅ Done: {question[:50]}...")
+
         except Exception as e:
-            error_count += 1
-            logger.error(f"❌ Error processing question {i}: {e}")
-
-            # Create error result
-            error_result = {
-                'question_index': start_index + i,
-                'question': question,
-                'answer': "ERROR",
-                'ground_truth': ground_truth,
-                'documents_count': 0,
-                'correctness': False,
-                'correctness_explanation': f"Error: {str(e)}",
-                'relevance': False,
-                'relevance_explanation': f"Error: {str(e)}",
-                'groundedness': False,
-                'groundedness_explanation': f"Error: {str(e)}",
-                'retrieval_relevance': False,
-                'retrieval_explanation': f"Error: {str(e)}",
-                'processing_time': time.time() - question_start_time
-            }
-            results.append(error_result)
-
-        # Progress update
-        elapsed_time = time.time() - start_time
-        avg_time_per_question = elapsed_time / i
-        estimated_total_time = avg_time_per_question * total_questions
-        remaining_time = estimated_total_time - elapsed_time
-
-        logger.info(f"📊 Progress: {i}/{total_questions} ({i/total_questions*100:.1f}%)")
-        logger.info(f"⏱️ Waktu berlalu: {elapsed_time:.1f}s, Estimasi sisa: {remaining_time:.1f}s")
-
-        # Add delay between questions
-        if i < total_questions:
-            controlled_delay(2, 4)
-
-    # Hitung statistik keseluruhan
-    total_time = time.time() - start_time
-    successful_results = [r for r in results if r['answer'] != "ERROR"]
-
-    if successful_results:
-        total_correctness = sum(r['correctness'] for r in successful_results)
-        total_relevance = sum(r['relevance'] for r in successful_results)
-        total_groundedness = sum(r['groundedness'] for r in successful_results)
-        total_retrieval_relevance = sum(r['retrieval_relevance'] for r in successful_results)
-        successful_count = len(successful_results)
-    else:
-        total_correctness = total_relevance = total_groundedness = total_retrieval_relevance = 0
-        successful_count = 0
-
-    # Print results
-    logger.info(f"\n📊 HASIL EVALUASI ENHANCED:")
-    logger.info(f"{'='*60}")
-    logger.info(f"Total Pertanyaan: {total_questions}")
-    logger.info(f"Berhasil Diproses: {success_count}")
-    logger.info(f"Error: {error_count}")
-    logger.info(f"Total Waktu: {total_time:.1f} detik ({total_time/60:.1f} menit)")
-    logger.info(f"Rata-rata per Pertanyaan: {total_time/total_questions:.1f} detik")
-
-    if successful_count > 0:
-        logger.info(f"\n🎯 SKOR EVALUASI (dari {successful_count} pertanyaan berhasil):")
-        logger.info(f"Kebenaran (Correctness): {total_correctness}/{successful_count} ({total_correctness/successful_count*100:.1f}%)")
-        logger.info(f"Relevansi (Relevance): {total_relevance}/{successful_count} ({total_relevance/successful_count*100:.1f}%)")
-        logger.info(f"Berdasarkan Dokumen (Groundedness): {total_groundedness}/{successful_count} ({total_groundedness/successful_count*100:.1f}%)")
-        logger.info(f"Retrieval Relevan: {total_retrieval_relevance}/{successful_count} ({total_retrieval_relevance/successful_count*100:.1f}%)")
+            logger.error(f"❌ Error on Q{idx}: {e}")
+            results.append({
+                "question": question,
+                "answer": "ERROR",
+                "correctness": False,
+                "correctness_explanation": str(e),
+                "relevance": False,
+                "relevance_explanation": str(e),
+                "groundedness": False,
+                "groundedness_explanation": str(e),
+                "retrieval_relevance": False,
+                "retrieval_explanation": str(e),
+            })
+
+        controlled_delay()
+
+    logger.info("🎯 Evaluation finished")
+    return results
+
+# === Jalankan saat script dieksekusi langsung ===
+if __name__ == "__main__":
+    run_enhanced_evaluation()
rag_evaluation_20250627_133749.log DELETED (file removed; contents not shown)
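
Note: the rewritten run_enhanced_evaluation no longer logs the aggregate scores the old version printed (the HASIL EVALUASI / SKOR EVALUASI block). Since it now returns the raw results list, a caller can recompute an equivalent summary — a sketch mirroring the removed tallying code:

    results = run_enhanced_evaluation()
    ok = [r for r in results if r["answer"] != "ERROR"]  # skip questions that errored out
    if ok:
        for metric in ("correctness", "relevance", "groundedness", "retrieval_relevance"):
            score = sum(r[metric] for r in ok)  # True/False grades sum to a count
            print(f"{metric}: {score}/{len(ok)} ({score / len(ok) * 100:.1f}%)")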