from datetime import datetime, timezone
import json
import os
from collections import defaultdict
from typing import Dict, List

import google.generativeai as genai
from bson import ObjectId
from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_KEY")
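# A minimal .env sketch assumed by this module (the variable names come from
# the os.getenv calls in this file; the values are placeholders):
#   GEMINI_KEY=<your Google Generative AI API key>
#   MONGO_URI=mongodb+srv://<user>:<password>@<cluster>/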
class NovaScholarAnalytics:
def __init__(self, model_name: str = "gemini-1.5-flash"):
genai.configure(api_key=GEMINI_API_KEY)
self.model = genai.GenerativeModel(model_name)
def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
        """Preprocess chat histories to focus on relevant information."""
        processed = []
        for chat in chat_histories:
            # Normalize user_id: accept both a raw ObjectId and the
            # Mongo-export form {"$oid": "..."}.
            if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"]:
                user_id = str(chat["user_id"]["$oid"])
            else:
                user_id = str(chat["user_id"])
try:
processed_chat = {
"user_id": user_id,
"messages": [
{
"prompt": msg["prompt"],
"response": msg["response"]
}
for msg in chat["messages"]
]
}
processed.append(processed_chat)
print(f"Successfully processed chat for user: {user_id}")
except Exception as e:
print(f"Error processing chat for user: {user_id}")
print(f"Error details: {str(e)}")
continue
return processed
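    # Illustrative shape for the preprocessing above (values made up): raw
    # records may carry user_id as an ObjectId or as a Mongo-export
    # {"$oid": ...} dict, and any extra message fields are dropped so the
    # prompt sees only prompt/response pairs.
    #   in:  {"user_id": {"$oid": "64b0f1..."},
    #         "messages": [{"prompt": "What is X?", "response": "X is ..."}]}
    #   out: {"user_id": "64b0f1...",
    #         "messages": [{"prompt": "What is X?", "response": "X is ..."}]}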
def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str:
"""Creates a structured prompt for Gemini to analyze chat histories."""
return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE.
Context:
- Chat histories: {json.dumps(chat_histories, indent=2)}
- These are pre-class interactions between students and an AI tutor
- Topics covered: {', '.join(all_topics)}
Your task is to provide analytics with supporting evidence from the chat histories.
Output Format (strictly follow this JSON structure):
{{
"topic_wise_insights": [
{{
"topic": "<string>",
"struggling_percentage": <number between 0 and 1>,
"evidence": {{
"calculation": "Explain how struggling_percentage was calculated",
"supporting_messages": [
{{
"user_id": "<string>",
"message": "<string>",
"reasoning": "Why this message indicates struggling"
}}
]
}},
"key_issues": ["<string>"],
"key_misconceptions": ["<string>"],
"evidence_for_issues": [
{{
"issue": "<string>",
"supporting_messages": [
{{
"user_id": "<string>",
"message": "<string>"
}}
]
}}
]
}}
],
"ai_recommended_actions": [
{{
"action": "<string>",
"priority": "high|medium|low",
"reasoning": "<string>",
"evidence": {{
"supporting_messages": [
{{
"user_id": "<string>",
"message": "<string>",
"relevance": "Why this message supports the recommendation"
}}
],
"pattern_description": "Description of the pattern observed in chat histories"
}},
"expected_outcome": "<string>"
}}
],
"student_analytics": [
{{
"student_id": "<string>",
"engagement_metrics": {{
"participation_level": <number between 0 and 1>,
"concept_understanding": "strong|moderate|needs_improvement",
"question_quality": "advanced|intermediate|basic"
}},
"evidence": {{
"participation_calculation": "Explain how participation_level was calculated",
"understanding_evidence": [
{{
"message": "<string>",
"analysis": "Why this indicates their understanding level"
}}
],
"question_quality_evidence": [
{{
"question": "<string>",
"analysis": "Why this question is classified at this level"
}}
]
}},
"struggling_topics": ["<string>"],
"personalized_recommendation": "<string>"
}}
]
}}
Guidelines for Analysis:
1. For every insight, recommendation, or metric, provide specific evidence from the chat histories
2. Explain calculations (e.g., how struggling_percentage was derived)
3. Include relevant message excerpts that support each conclusion
4. For recommendations, show the pattern of student interactions that led to that recommendation
5. When analyzing question quality or understanding, provide reasoning for the classification
The response must adhere strictly to the above JSON structure, with all fields populated appropriately."""
def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
"""Validate the initial analytics by checking evidence."""
validation_prompt = f"""Review and validate the following analytics based on the provided evidence.
Analytics to validate: {json.dumps(initial_analytics, indent=2)}
For each section:
1. Verify if the evidence supports the conclusions
2. Check if calculations (percentages, metrics) are justified by the data
3. Validate if recommendations are supported by patterns in the chat history
Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
For any removed items, include them in a separate "insufficient_evidence" section with explanation."""
try:
validation_response = self.model.generate_content(
validation_prompt,
generation_config=genai.GenerationConfig(
response_mime_type="application/json",
temperature=0.1
)
)
validated_analytics = json.loads(validation_response.text)
return validated_analytics
except Exception as e:
print(f"Error in validation: {str(e)}")
return initial_analytics
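    # The validated payload keeps the input schema and may add an
    # "insufficient_evidence" section for removed items, e.g. (illustrative):
    #   {"topic_wise_insights": [...],
    #    "insufficient_evidence": [{"item": "...", "explanation": "..."}]}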
def _enrich_analytics(self, analytics: Dict) -> Dict:
"""Add derived insights and metrics to the validated analytics."""
try:
            # Calculate class distribution. The prompt schema stores
            # per-student records under "student_analytics" (older payloads
            # may use "student_insights", so fall back to that key).
            students = analytics.get("student_analytics",
                                     analytics.get("student_insights", []))
            total_students = len(students)
            performance_distribution = defaultdict(int)
            for student in students:
metrics = student.get("engagement_metrics", {})
understanding = metrics.get("concept_understanding", "moderate")
if understanding == "strong":
performance_distribution["high_performers"] += 1
elif understanding == "needs_improvement":
performance_distribution["at_risk"] += 1
else:
performance_distribution["average_performers"] += 1
# Convert to percentages
class_distribution = {
level: count/total_students if total_students > 0 else 0
for level, count in performance_distribution.items()
}
            # Calculate overall engagement
            engagement_sum = sum(
                student.get("engagement_metrics", {}).get("participation_level", 0)
                for student in students
            )
overall_engagement = engagement_sum / total_students if total_students > 0 else 0
# Identify critical topics (those with high struggling percentage)
critical_topics = [
topic["topic"]
for topic in analytics.get("topic_wise_insights", [])
if topic.get("struggling_percentage", 0) > 0.7 # 70% threshold
]
# Identify students needing intervention
immediate_attention = []
monitoring_required = []
            for student in students:
student_id = student.get("student_id")
metrics = student.get("engagement_metrics", {})
# Check for immediate attention needed
if (metrics.get("concept_understanding") == "needs_improvement" or
metrics.get("participation_level", 0) < 0.3 or # Less than 30% participation
len(student.get("struggling_topics", [])) > 2): # Struggling with more than 2 topics
immediate_attention.append(student_id)
# Check for monitoring
elif (metrics.get("concept_understanding") == "moderate" or
metrics.get("participation_level", 0) < 0.5): # Less than 50% participation
monitoring_required.append(student_id)
# Add enriched data to analytics
analytics["course_health"] = {
"overall_engagement": overall_engagement,
"critical_topics": critical_topics,
"class_distribution": class_distribution
}
analytics["intervention_metrics"] = {
"immediate_attention_needed": immediate_attention,
"monitoring_required": monitoring_required
}
# Add evidence for enriched metrics
analytics["course_health"]["evidence"] = {
"engagement_calculation": f"Calculated from average participation level of {total_students} students",
"critical_topics_criteria": "Topics where over 70% of students are struggling",
"distribution_calculation": "Based on concept understanding levels from student metrics"
}
analytics["intervention_metrics"]["evidence"] = {
"immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
"monitoring_criteria": "Students with moderate understanding or participation < 50%"
}
return analytics
except Exception as e:
print(f"Error enriching analytics: {str(e)}")
return analytics # Return original analytics if enrichment fails
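    # Sketch of what _enrich_analytics adds on success (values illustrative):
    #   analytics["course_health"] = {
    #       "overall_engagement": 0.62,        # mean participation_level
    #       "critical_topics": ["Topic A"],    # struggling_percentage > 0.7
    #       "class_distribution": {"high_performers": 0.25, ...},
    #       "evidence": {...},
    #   }
    #   analytics["intervention_metrics"] = {
    #       "immediate_attention_needed": ["<student_id>", ...],
    #       "monitoring_required": ["<student_id>", ...],
    #       "evidence": {...},
    #   }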
def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
"""Main method to generate analytics with evidence-based validation."""
try:
if not chat_histories or not all_topics:
print("Missing required input data")
return self._fallback_analytics()
try:
processed_histories = self._preprocess_chat_histories(chat_histories)
print("Successfully preprocessed chat histories")
except Exception as preprocess_error:
print(f"Error in preprocessing: {str(preprocess_error)}")
return self._fallback_analytics()
try:
prompt = self._create_analytics_prompt(processed_histories, all_topics)
print("Successfully created prompt")
print("Prompt preview:", prompt[:200] + "...") # Print first 200 chars
except Exception as prompt_error:
print(f"Error in prompt creation: {str(prompt_error)}")
return self._fallback_analytics()
            # Generate initial analytics with evidence
response = self.model.generate_content(
prompt,
generation_config=genai.GenerationConfig(
response_mime_type="application/json",
temperature=0.15
)
)
print(response.text)
if not response.text:
print("Empty response from Gemini")
return self._fallback_analytics()
            # Parse initial analytics
            initial_analytics = json.loads(response.text)
            print("Initial analytics:", initial_analytics)
            # Validate analytics using evidence
            validated_analytics = self._validate_analytics_with_evidence(initial_analytics)
            # Enrich with additional metrics
            final_analytics = self._enrich_analytics(validated_analytics)
return final_analytics
except Exception as e:
print(f"Error generating analytics: {str(e)}")
return self._fallback_analytics()
def _fallback_analytics(self) -> Dict:
"""Provide fallback analytics with explanation."""
        # Keys mirror the schema produced by _create_analytics_prompt so
        # downstream consumers see a consistent shape even on failure.
        return {
            "topic_wise_insights": [],
            "student_analytics": [],
            "ai_recommended_actions": [
{
"action": "Review analytics generation process",
"priority": "high",
"target_group": "system_administrators",
"reasoning": "Analytics generation failed",
"expected_impact": "Restore analytics functionality",
"evidence": {
"error": "Analytics generation failed to complete"
}
}
],
"course_health": {
"overall_engagement": 0,
"critical_topics": [],
"class_distribution": {
"high_performers": 0,
"average_performers": 0,
"at_risk": 0
}
},
"intervention_metrics": {
"immediate_attention_needed": [],
"monitoring_required": []
}
}
    def _process_gemini_response(self, response: str) -> Dict:
        """Parse a raw Gemini response string into an analytics dict."""
        try:
            # json_serializer is a json.dumps `default` hook, not a
            # json.loads object_hook, so a plain loads is correct here.
            analytics = json.loads(response)
if not isinstance(analytics, dict):
raise ValueError("Invalid response format")
return analytics
except Exception as e:
print(f"Error processing Gemini response: {str(e)}")
return self._fallback_analytics()
MONGODB_URI = os.getenv("MONGO_URI")
from file_upload_vectorize import model
import streamlit as st
def extract_topics_from_materials(session_id):
"""Extract topics from pre-class materials"""
    # Materialize the cursor so the emptiness check below actually works
    # (a raw pymongo Cursor is always truthy).
    materials = list(resources_collection.find({"session_id": session_id}))
texts = ""
if materials:
for material in materials:
if 'text_content' in material:
text = material['text_content']
texts += text + "\n"
else:
st.warning("No text content found in the material.")
return
else:
st.error("No pre-class materials found for this session.")
return
if texts:
context_prompt = f"""
Task: Extract Comprehensive Topics in a List Format
You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.
Instructions:
Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:
Overarching themes
Main topics
Subtopics and their sub-subtopics
Indirectly related topics
Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.
Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.
Output Requirements:
Use this structure:
{{
"topics": [
"Topic 1",
"Topic 2",
"Topic 3",
...
]
}}
Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.
Content to Analyze:
{texts}
"""
        try:
            response = model.generate_content(
                context_prompt,
                generation_config=genai.GenerationConfig(temperature=0.3)
            )
            if not response or not response.text:
                st.error("Error extracting topics from materials.")
                return None
            # The prompt requests a {"topics": [...]} object; parse it and
            # return the flat topic list instead of the raw response text.
            topics = json.loads(response.text).get("topics", [])
            return topics
except Exception as e:
st.error(f"Error extracting topics: {str(e)}")
return None
else:
st.error("No text content found in the pre-class materials.")
return None
def get_chat_history(user_id, session_id):
query = {
"user_id": ObjectId(user_id),
"session_id": session_id,
"timestamp": {"$lte": datetime.utcnow()}
}
result = chat_history_collection.find(query)
return list(result)
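# Illustrative chat_history document shape assumed by the query above
# (field names taken from this file; values are placeholders):
#   {"user_id": ObjectId("..."), "session_id": "S104",
#    "timestamp": datetime(...),
#    "messages": [{"prompt": "...", "response": "..."}]}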
def json_serializer(obj):
if isinstance(obj, ObjectId):
return str(obj)
raise TypeError(f"Type {type(obj)} not serializable")
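# Usage sketch: json_serializer is a `default` hook for json.dumps (called
# only for values json cannot serialize, i.e. ObjectId here); it is not an
# object_hook for json.loads:
#   json.dumps({"user_id": ObjectId()}, default=json_serializer)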
if __name__ == "__main__":
client = MongoClient(MONGODB_URI)
db = client["novascholar_db"]
chat_history_collection = db["chat_history"]
resources_collection = db["resources"]
session_id = "S104"
# Connect to MongoDB
user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
# Debug print 2: Check user_ids
print("Found user_ids:", user_ids)
all_chat_histories = []
for user_id in user_ids:
result = get_chat_history(user_id, session_id)
# Debug print 3: Check each chat history result
print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
if result:
for record in result:
chat_history = {
"user_id": record["user_id"], # Convert ObjectId to string
"session_id": record["session_id"],
"messages": record["messages"]
}
all_chat_histories.append(chat_history)
print(all_chat_histories)
    # Optional: export all chat histories to a JSON file (the default hook
    # handles the ObjectId user_id fields), e.g.:
    # with open("sample_files/all_chat_histories3.json", "w") as file:
    #     json.dump(all_chat_histories, file, indent=2, default=json_serializer)
# Debug print 4: Check chat histories
print("Total chat histories collected:", len(all_chat_histories))
    # Optional: regenerate topics from the pre-class materials, e.g.:
    # topics = extract_topics_from_materials(session_id)
    # with open("sample_files/extracted_topics.json", "w") as file:
    #     json.dump(topics, file, indent=2)
    # Load previously extracted topics from the JSON file
    with open("sample_files/extracted_topics.json", "r") as file:
        topics = json.load(file)
    # The file may hold either a flat list or a {"topics": [...]} object;
    # normalize to a flat list, since the analytics prompt joins topics with ", ".
    if isinstance(topics, dict):
        topics = topics.get("topics", [])
# Debug print 5: Check topics
print("Extracted topics:", topics)
# Generate analytics
analytics_generator = NovaScholarAnalytics()
analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
# Debug print 6: Check generated analytics
print("Generated Analytics:", analytics)