# analysis.py
import json
import pandas as pd
from collections import Counter, defaultdict
from typing import List, Dict
class LogAnalyzer:
    """Compute usage statistics and reports from JSONL chat logs."""
def __init__(self, log_path: str = "chat_history/chat_logs.json"):
self.log_path = log_path
self.logs = self._load_logs()
    def _load_logs(self) -> List[Dict]:
        """Load and parse log entries from a JSON Lines file"""
        try:
            with open(self.log_path, "r", encoding="utf-8") as f:
                # Skip blank lines so a trailing newline doesn't raise
                # JSONDecodeError and discard the entire log.
                return [json.loads(line) for line in f if line.strip()]
        except (FileNotFoundError, json.JSONDecodeError):
            return []
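    # Expected shape of one log record (inferred from the fields the
    # methods below read; the exact schema is defined by whatever writes
    # the log file):
    #   {"timestamp": ..., "session_id": ..., "user_input": ...,
    #    "bot_response": ..., "context": ...}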
def get_basic_stats(self) -> Dict:
"""Calculate basic conversation statistics"""
if not self.logs:
return {}
        return {
            "total_interactions": len(self.logs),
            # session_id serves as a proxy for a distinct user
            "unique_users": len({log.get('session_id') for log in self.logs}),
            "avg_response_length": pd.Series([len(log.get('bot_response', '')) for log in self.logs]).mean(),
"most_common_questions": self._get_common_questions(),
"knowledge_base_usage": self._calculate_kb_usage()
}
    def _get_common_questions(self, top_n: int = 5) -> List[Dict]:
        """Identify the most frequent user questions"""
        question_counts = Counter(log.get('user_input', '') for log in self.logs)
        return [
            {"question": question, "count": count}
            for question, count in question_counts.most_common(top_n)
        ]
    def _calculate_kb_usage(self) -> Dict:
        """Count interactions answered with vs. without knowledge base context"""
        context_usage = defaultdict(int)
        for log in self.logs:
            if log.get('context'):
                context_usage['with_context'] += 1
            else:
                context_usage['without_context'] += 1
        return dict(context_usage)
    def temporal_analysis(self) -> Dict:
        """Analyze usage patterns over time (daily and hourly counts)"""
        if not self.logs:
            return {"daily_activity": {}, "hourly_pattern": {}}
        df = pd.DataFrame(self.logs)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        daily = df.resample('D', on='timestamp').size()
        hourly = df.groupby(df['timestamp'].dt.hour).size()
        return {
            # Plain string/int keys keep the generated report human-readable
            "daily_activity": {ts.strftime('%Y-%m-%d'): int(n) for ts, n in daily.items()},
            "hourly_pattern": {int(h): int(n) for h, n in hourly.items()}
        }
def generate_report(self) -> str:
"""Generate comprehensive analysis report"""
if not self.logs:
return "No logs available for analysis"
stats = self.get_basic_stats()
temporal = self.temporal_analysis()
# Format top questions
top_questions = "\n".join(
f"- {q['question']}: {q['count']}"
for q in stats['most_common_questions']
)
# Build report sections separately
header = "Legal Assistant Usage Report\n----------------------------"
        period = f"Period: {self.logs[0]['timestamp']} - {self.logs[-1]['timestamp']}"
basic_stats = (
f"Total Interactions: {stats['total_interactions']}\n"
f"Unique Users: {stats['unique_users']}\n"
f"Average Response Length: {stats['avg_response_length']:.1f} chars"
)
kb_usage = (
"Knowledge Base Usage:\n"
f"- With context: {stats['knowledge_base_usage'].get('with_context', 0)}\n"
f"- Without context: {stats['knowledge_base_usage'].get('without_context', 0)}"
)
patterns = (
"Usage Patterns:\n"
f"- Daily Activity: {temporal['daily_activity']}\n"
f"- Hourly Distribution: {temporal['hourly_pattern']}"
)
# Combine all sections
report_sections = [
header,
period,
basic_stats,
"Top Questions:",
top_questions,
kb_usage,
patterns
]
# Join sections with double newlines
return "\n\n".join(report_sections)
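
# Minimal usage sketch: assumes a JSONL log file exists at the default
# path ("chat_history/chat_logs.json") and prints the plain-text report.
if __name__ == "__main__":
    analyzer = LogAnalyzer()
    print(analyzer.generate_report())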