multiple evaluation rounds for each test case
src/know_lang_bot/evaluation/chatbot_evaluation.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List
 from enum import Enum
 from pydantic import BaseModel, Field, computed_field
 from pydantic_ai import Agent
@@ -45,11 +45,40 @@ class EvalAgentResponse(MetricScores):
     """Raw response from evaluation agent"""
     feedback: str
 
+class EvalRound(BaseModel):
+    """Single evaluation round results"""
+    round_id: int
+    eval_response: EvalAgentResponse
+    timestamp: datetime.datetime
+
 class EvalResult(BaseModel):
-    """
+    """Extended evaluation result with multiple rounds"""
     evaluator_model: str
     case: EvalCase
-    eval_response: EvalAgentResponse
+    eval_rounds: List[EvalRound]
+
+    @computed_field
+    def aggregated_scores(self) -> MetricScores:
+        """Calculate mean scores across rounds"""
+        chunk_relevance = EvalMetric.CHUNK_RELEVANCE.value
+        answer_correctness = EvalMetric.ANSWER_CORRECTNESS.value
+        code_reference = EvalMetric.CODE_REFERENCE.value
+
+        scores = {
+            chunk_relevance: [],
+            answer_correctness: [],
+            code_reference: []
+        }
+
+        for round in self.eval_rounds:
+            for metric in scores.keys():
+                scores[metric].append(getattr(round.eval_response, metric))
+
+        return MetricScores(
+            chunk_relevance=sum(scores[chunk_relevance]) / len(self.eval_rounds),
+            answer_correctness=sum(scores[answer_correctness]) / len(self.eval_rounds),
+            code_reference=sum(scores[code_reference]) / len(self.eval_rounds)
+        )
 
 class ChatBotEvaluationContext(EvalCase, ChatResult):
     pass
@@ -100,44 +129,38 @@ Format your response as JSON:
     async def evaluate_single(
         self,
         case: EvalCase,
-        chat_result: ChatResult
+        chat_result: ChatResult,
+        num_rounds: int = 1,
     ) -> EvalResult:
-        """Evaluate a single case"""
+        """Evaluate a single case for multiple rounds"""
+        eval_rounds = []
         # Prepare evaluation context
         eval_context = ChatBotEvaluationContext(
             **case.model_dump(),
             **chat_result.model_dump()
         )
 
-        # Get evaluation from the model
-        result = await self.eval_agent.run(
-            eval_context.model_dump_json(),
-        )
-
+        for round_id in range(num_rounds):
+            # Get evaluation from the model
+            result = await self.eval_agent.run(
+                eval_context.model_dump_json(),
+            )
+
+            eval_rounds.append(EvalRound(
+                round_id=round_id,
+                eval_response=result.data,
+                timestamp=datetime.datetime.now()
+            ))
+
+            # Add delay between rounds to avoid rate limits
+            await asyncio.sleep(2)
 
         return EvalResult(
            case=case,
-            eval_response=result.data,
+            eval_rounds=eval_rounds,
            evaluator_model=f"{self.config.evaluator.model_provider}:{self.config.evaluator.model_name}"
        )
 
-    async def evaluate_batch(
-        self,
-        cases: List[EvalCase],
-        process_chat_func,
-        max_concurrent: int = 2
-    ) -> List[EvalResult]:
-        """Run evaluation on multiple cases with concurrency control"""
-        semaphore = asyncio.Semaphore(max_concurrent)
-
-        async def eval_single_with_limit(case: EvalCase) -> EvalResult:
-            async with semaphore:
-                chat_result = await process_chat_func(case.question)
-                return await self.evaluate_single(case, chat_result)
-
-        return await asyncio.gather(
-            *[eval_single_with_limit(case) for case in cases]
-        )
 
 # src/transformers/quantizers/base.py
 TRANSFORMER_QUANTIZER_BASE_CASES = [
@@ -374,7 +397,7 @@ async def main():
             summary_list.append(eval_summary)
 
             import time
-            time.sleep(
+            time.sleep(3)  # Sleep to avoid rate limiting
 
     except Exception:
         console.print_exception()
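With evaluate_batch (and its semaphore-based concurrency) removed by this commit, callers drive the loop themselves. A hypothetical sequential driver, where evaluator, process_chat, and cases stand in for the project's real objects, might look like:

# Hypothetical driver loop after this change; `evaluator`, `process_chat`, and
# `cases` are placeholders for the project's real objects, not names from the diff.
import asyncio

async def run_evaluation(evaluator, process_chat, cases):
    results = []
    for case in cases:  # sequential on purpose, to stay under rate limits
        chat_result = await process_chat(case.question)
        result = await evaluator.evaluate_single(case, chat_result, num_rounds=3)
        results.append(result)
        print(case.question, result.aggregated_scores)  # mean scores over the 3 rounds
    return results

# asyncio.run(run_evaluation(evaluator, process_chat, cases))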
src/know_lang_bot/evaluation/chatbot_evaluation_visualize.py
CHANGED
@@ -11,6 +11,7 @@ class RetrievalMethod(str, Enum):
     EMBEDDING = "embedding"
     EMBEDDING_RERANKING = "embedding_reranking"
     EMBEDDING_WITH_CODE = "embedding_with_code"
+    OPENAI_EMBEDDING_WITH_CODE = "openai_embedding_with_code"
     EMBEDDING_RERANKING_WITH_CODE = "embedding_reranking_with_code"
 
 class ResultAnalyzer:
@@ -21,9 +22,10 @@ class ResultAnalyzer:
         # Map each method to its directory
         self.method_dirs = {
             RetrievalMethod.EMBEDDING: self.base_dir / RetrievalMethod.EMBEDDING.value,
-            RetrievalMethod.EMBEDDING_RERANKING: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING.value,
+            # RetrievalMethod.EMBEDDING_RERANKING: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING.value,
             RetrievalMethod.EMBEDDING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_WITH_CODE.value,
-            RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE.value,
+            RetrievalMethod.OPENAI_EMBEDDING_WITH_CODE: self.base_dir / RetrievalMethod.OPENAI_EMBEDDING_WITH_CODE.value,
+            # RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE: self.base_dir / RetrievalMethod.EMBEDDING_RERANKING_WITH_CODE.value,
         }
 
     def load_results(self, file_path: Path) -> List[EvalSummary]:
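These hunks only change which result directories get scanned; how the mapping is consumed is not shown here. A hypothetical walk over it (the import path, the "*.json" glob, and the constructor call are assumptions for illustration, not taken from the diff) would be roughly:

# Hypothetical consumption of method_dirs; import path, "*.json" glob, and the
# constructor call are assumptions, not taken from the diff.
from pathlib import Path
from know_lang_bot.evaluation.chatbot_evaluation_visualize import ResultAnalyzer  # path assumed from file layout

analyzer = ResultAnalyzer(base_dir=Path("evaluations"))  # constructor signature assumed

for method, directory in analyzer.method_dirs.items():
    if not directory.exists():  # skip methods with no results on disk
        continue
    for result_file in sorted(directory.glob("*.json")):
        results = analyzer.load_results(result_file)
        print(method.value, result_file.name, len(results))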
@@ -36,17 +38,26 @@ class ResultAnalyzer:
         """Convert results to pandas DataFrame with flattened metrics"""
         rows = []
         for result in results:
-            row = {
+            # Basic metrics
+            base_row = {
                 "evaluator_model": result.evaluator_model,
                 "question": result.case.question,
                 "difficulty": result.case.difficulty,
-                "chunk_relevance": result.eval_response.chunk_relevance,
-                "answer_correctness": result.eval_response.answer_correctness,
-                "code_reference": result.eval_response.code_reference,
-                "weighted_total": result.eval_response.weighted_total,
-                "environment": getattr(result.case, 'environment', 'default') # Added environment
+                "environment": getattr(result.case, 'environment', 'default')
             }
-            rows.append(row)
+
+            # Add metrics for each round
+            for round in result.eval_rounds:
+                row = base_row.copy()
+                row.update({
+                    "round_id": round.round_id,
+                    "chunk_relevance": round.eval_response.chunk_relevance,
+                    "answer_correctness": round.eval_response.answer_correctness,
+                    "code_reference": round.eval_response.code_reference,
+                    "weighted_total": round.eval_response.weighted_total,
+                    "timestamp": round.timestamp
+                })
+                rows.append(row)
 
         return pd.DataFrame(rows)
 
@@ -71,8 +82,17 @@ class ResultAnalyzer:
         return f"{improvement:+.1f}%" if improvement else "0%"
 
     def get_stats_by_group(self, df: pd.DataFrame, group_by: str) -> pd.DataFrame:
-        """Calculate statistics
-        return df.groupby(group_by).agg({
+        """Calculate statistics with round variance"""
+        # First get mean per question/round
+        question_means = df.groupby([group_by, "question"]).agg({
+            "chunk_relevance": "mean",
+            "answer_correctness": "mean",
+            "code_reference": "mean",
+            "weighted_total": "mean"
+        })
+
+        # Then get mean and std across questions
+        return question_means.groupby(level=0).agg({
             "chunk_relevance": ["mean", "std"],
             "answer_correctness": ["mean", "std"],
             "code_reference": ["mean", "std"],