|
from collections import defaultdict
from datetime import datetime
from typing import Dict, List
import json
import os

from bson import ObjectId
from dotenv import load_dotenv
from pymongo import MongoClient
import google.generativeai as genai

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_KEY")
MONGODB_URI = os.getenv("MONGO_URI")
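
# This module builds evidence-backed course analytics from student/AI-tutor
# chat histories stored in MongoDB. The pipeline runs in three passes:
# Gemini generates initial analytics, a second Gemini call validates the
# cited evidence, and local code enriches the result with derived metrics.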
|
|
|
class NovaScholarAnalytics:
    def __init__(self, model_name: str = "gemini-1.5-flash"):
        genai.configure(api_key=GEMINI_API_KEY)
        self.model = genai.GenerativeModel(model_name)
|
|
|
    def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
        """Preprocess chat histories to focus on relevant information."""
        processed = []

        for chat in chat_histories:
            user_id = "unknown"
            try:
                # user_id may arrive either as a raw ObjectId or in MongoDB
                # extended-JSON form ({"$oid": "..."}); normalize both to str.
                # Extraction happens inside the try block so a malformed record
                # cannot crash the loop before the except clause can log it.
                if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"]:
                    user_id = str(chat["user_id"]["$oid"])
                else:
                    user_id = str(chat["user_id"])

                processed_chat = {
                    "user_id": user_id,
                    "messages": [
                        {"prompt": msg["prompt"], "response": msg["response"]}
                        for msg in chat["messages"]
                    ],
                }
                processed.append(processed_chat)
                print(f"Successfully processed chat for user: {user_id}")
            except Exception as e:
                # Skip malformed records rather than aborting the whole batch.
                print(f"Error processing chat for user: {user_id}")
                print(f"Error details: {str(e)}")
                continue

        return processed
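
    # Expected input records (as assembled in the __main__ block below):
    #     {"user_id": ObjectId(...) or {"$oid": "..."},
    #      "session_id": "...",
    #      "messages": [{"prompt": "...", "response": "..."}, ...]}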
|
|
|
def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str: |
|
"""Creates a structured prompt for Gemini to analyze chat histories.""" |
|
return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE. |
|
|
|
Context: |
|
- Chat histories: {json.dumps(chat_histories, indent=2)} |
|
- These are pre-class interactions between students and an AI tutor |
|
- Topics covered: {', '.join(all_topics)} |
|
|
|
Your task is to provide analytics with supporting evidence from the chat histories. |
|
|
|
Output Format (strictly follow this JSON structure): |
|
{{ |
|
"topic_wise_insights": [ |
|
{{ |
|
"topic": "<string>", |
|
"struggling_percentage": <number between 0 and 1>, |
|
"evidence": {{ |
|
"calculation": "Explain how struggling_percentage was calculated", |
|
"supporting_messages": [ |
|
{{ |
|
"user_id": "<string>", |
|
"message": "<string>", |
|
"reasoning": "Why this message indicates struggling" |
|
}} |
|
] |
|
}}, |
|
"key_issues": ["<string>"], |
|
"key_misconceptions": ["<string>"], |
|
"evidence_for_issues": [ |
|
{{ |
|
"issue": "<string>", |
|
"supporting_messages": [ |
|
{{ |
|
"user_id": "<string>", |
|
"message": "<string>" |
|
}} |
|
] |
|
}} |
|
] |
|
}} |
|
], |
|
"ai_recommended_actions": [ |
|
{{ |
|
"action": "<string>", |
|
"priority": "high|medium|low", |
|
"reasoning": "<string>", |
|
"evidence": {{ |
|
"supporting_messages": [ |
|
{{ |
|
"user_id": "<string>", |
|
"message": "<string>", |
|
"relevance": "Why this message supports the recommendation" |
|
}} |
|
], |
|
"pattern_description": "Description of the pattern observed in chat histories" |
|
}}, |
|
"expected_outcome": "<string>" |
|
}} |
|
], |
|
"student_analytics": [ |
|
{{ |
|
"student_id": "<string>", |
|
"engagement_metrics": {{ |
|
"participation_level": <number between 0 and 1>, |
|
"concept_understanding": "strong|moderate|needs_improvement", |
|
"question_quality": "advanced|intermediate|basic" |
|
}}, |
|
"evidence": {{ |
|
"participation_calculation": "Explain how participation_level was calculated", |
|
"understanding_evidence": [ |
|
{{ |
|
"message": "<string>", |
|
"analysis": "Why this indicates their understanding level" |
|
}} |
|
], |
|
"question_quality_evidence": [ |
|
{{ |
|
"question": "<string>", |
|
"analysis": "Why this question is classified at this level" |
|
}} |
|
] |
|
}}, |
|
"struggling_topics": ["<string>"], |
|
"personalized_recommendation": "<string>" |
|
}} |
|
] |
|
}} |
|
|
|
Guidelines for Analysis: |
|
1. For every insight, recommendation, or metric, provide specific evidence from the chat histories |
|
2. Explain calculations (e.g., how struggling_percentage was derived) |
|
3. Include relevant message excerpts that support each conclusion |
|
4. For recommendations, show the pattern of student interactions that led to that recommendation |
|
5. When analyzing question quality or understanding, provide reasoning for the classification |
|
|
|
The response must adhere strictly to the above JSON structure, with all fields populated appropriately.""" |
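
    # Sketch (assumption, not in the original code): newer google-generativeai
    # releases can enforce JSON shape with a typed `response_schema` instead of
    # prompt instructions alone. The field names below are illustrative only.
    #
    #     import typing_extensions as typing
    #
    #     class TopicInsight(typing.TypedDict):
    #         topic: str
    #         struggling_percentage: float
    #
    #     config = genai.GenerationConfig(
    #         response_mime_type="application/json",
    #         response_schema=list[TopicInsight],
    #     )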
|
|
|
    def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
        """Validate the initial analytics by checking evidence."""
        validation_prompt = f"""Review and validate the following analytics based on the provided evidence.

Analytics to validate: {json.dumps(initial_analytics, indent=2)}

For each section:
1. Verify if the evidence supports the conclusions
2. Check if calculations (percentages, metrics) are justified by the data
3. Validate if recommendations are supported by patterns in the chat history

Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
For any removed items, include them in a separate "insufficient_evidence" section with explanation."""

        try:
            validation_response = self.model.generate_content(
                validation_prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.1,
                ),
            )
            return json.loads(validation_response.text)
        except Exception as e:
            # Fall back to the unvalidated analytics rather than failing outright.
            print(f"Error in validation: {str(e)}")
            return initial_analytics
|
|
|
    def _enrich_analytics(self, analytics: Dict) -> Dict:
        """Add derived insights and metrics to the validated analytics."""
        try:
            # NOTE: the generation prompt requests a "student_analytics" key,
            # so read that one. (The original code read "student_insights",
            # which never matched and left every derived metric empty.)
            students = analytics.get("student_analytics", [])
            total_students = len(students)
            performance_distribution = defaultdict(int)

            for student in students:
                metrics = student.get("engagement_metrics", {})
                understanding = metrics.get("concept_understanding", "moderate")

                if understanding == "strong":
                    performance_distribution["high_performers"] += 1
                elif understanding == "needs_improvement":
                    performance_distribution["at_risk"] += 1
                else:
                    performance_distribution["average_performers"] += 1

            class_distribution = {
                level: count / total_students if total_students > 0 else 0
                for level, count in performance_distribution.items()
            }

            engagement_sum = sum(
                student.get("engagement_metrics", {}).get("participation_level", 0)
                for student in students
            )
            overall_engagement = engagement_sum / total_students if total_students > 0 else 0

            # Topics where more than 70% of students are struggling.
            critical_topics = [
                topic["topic"]
                for topic in analytics.get("topic_wise_insights", [])
                if topic.get("struggling_percentage", 0) > 0.7
            ]

            immediate_attention = []
            monitoring_required = []

            for student in students:
                student_id = student.get("student_id")
                metrics = student.get("engagement_metrics", {})

                # Immediate attention: weak understanding, very low
                # participation, or struggling with three or more topics.
                if (metrics.get("concept_understanding") == "needs_improvement" or
                        metrics.get("participation_level", 0) < 0.3 or
                        len(student.get("struggling_topics", [])) > 2):
                    immediate_attention.append(student_id)
                elif (metrics.get("concept_understanding") == "moderate" or
                        metrics.get("participation_level", 0) < 0.5):
                    monitoring_required.append(student_id)

            analytics["course_health"] = {
                "overall_engagement": overall_engagement,
                "critical_topics": critical_topics,
                "class_distribution": class_distribution,
                "evidence": {
                    "engagement_calculation": f"Calculated from average participation level of {total_students} students",
                    "critical_topics_criteria": "Topics where over 70% of students are struggling",
                    "distribution_calculation": "Based on concept understanding levels from student metrics",
                },
            }

            analytics["intervention_metrics"] = {
                "immediate_attention_needed": immediate_attention,
                "monitoring_required": monitoring_required,
                "evidence": {
                    "immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
                    "monitoring_criteria": "Students with moderate understanding or participation < 50%",
                },
            }

            return analytics

        except Exception as e:
            print(f"Error enriching analytics: {str(e)}")
            return analytics
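
    # Worked example with illustrative numbers: for 4 students with
    # participation levels [0.2, 0.5, 0.8, 0.9], overall_engagement is
    # 2.4 / 4 = 0.6, and a topic with struggling_percentage 0.75 is added
    # to critical_topics because it exceeds the 0.7 threshold.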
|
|
|
    def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
        """Main method to generate analytics with evidence-based validation."""
        try:
            if not chat_histories or not all_topics:
                print("Missing required input data")
                return self._fallback_analytics()

            try:
                processed_histories = self._preprocess_chat_histories(chat_histories)
                print("Successfully preprocessed chat histories")
            except Exception as preprocess_error:
                print(f"Error in preprocessing: {str(preprocess_error)}")
                return self._fallback_analytics()

            try:
                prompt = self._create_analytics_prompt(processed_histories, all_topics)
                print("Successfully created prompt")
                print("Prompt preview:", prompt[:200] + "...")
            except Exception as prompt_error:
                print(f"Error in prompt creation: {str(prompt_error)}")
                return self._fallback_analytics()

            # Pass 1: generate the initial evidence-backed analytics.
            response = self.model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.15,
                ),
            )

            if not response.text:
                print("Empty response from Gemini")
                return self._fallback_analytics()

            initial_analytics = json.loads(response.text)
            print("Initial analytics:", initial_analytics)

            # Pass 2: have the model re-check its own cited evidence.
            validated_analytics = self._validate_analytics_with_evidence(initial_analytics)

            # Pass 3: add locally computed, deterministic metrics.
            final_analytics = self._enrich_analytics(validated_analytics)

            return final_analytics

        except Exception as e:
            print(f"Error generating analytics: {str(e)}")
            return self._fallback_analytics()
|
|
|
    def _fallback_analytics(self) -> Dict:
        """Provide fallback analytics with explanation."""
        # Keys mirror the schema requested in _create_analytics_prompt so that
        # downstream consumers see a consistent shape even on failure.
        return {
            "topic_wise_insights": [],
            "student_analytics": [],
            "ai_recommended_actions": [
                {
                    "action": "Review analytics generation process",
                    "priority": "high",
                    "target_group": "system_administrators",
                    "reasoning": "Analytics generation failed",
                    "expected_impact": "Restore analytics functionality",
                    "evidence": {"error": "Analytics generation failed to complete"},
                }
            ],
            "course_health": {
                "overall_engagement": 0,
                "critical_topics": [],
                "class_distribution": {
                    "high_performers": 0,
                    "average_performers": 0,
                    "at_risk": 0,
                },
            },
            "intervention_metrics": {
                "immediate_attention_needed": [],
                "monitoring_required": [],
            },
        }

    def _process_gemini_response(self, response: str) -> Dict:
        try:
            # json.loads takes no serializer; the original passed json_serializer
            # as object_hook, which raised TypeError on every parsed object.
            analytics = json.loads(response)
            if not isinstance(analytics, dict):
                raise ValueError("Invalid response format")
            return analytics
        except Exception as e:
            print(f"Error processing Gemini response: {str(e)}")
            return self._fallback_analytics()
|
|
|
# Module-level Mongo handles, so the helper functions below also work when
# this module is imported rather than run as a script. (The duplicate
# load_dotenv() call was removed; MONGODB_URI is read at the top of the file.)
client = MongoClient(MONGODB_URI)
db = client["novascholar_db"]
chat_history_collection = db["chat_history"]
resources_collection = db["resources"]

from file_upload_vectorize import model
import streamlit as st
|
|
|
def extract_topics_from_materials(session_id):
    """Extract topics from pre-class materials."""
    # find() returns a cursor, which is always truthy; materialize it so the
    # emptiness check below actually works.
    materials = list(resources_collection.find({"session_id": session_id}))
    if not materials:
        st.error("No pre-class materials found for this session.")
        return None

    texts = ""
    for material in materials:
        if "text_content" in material:
            texts += material["text_content"] + "\n"
        else:
            # Skip materials without text rather than aborting the extraction.
            st.warning("No text content found in a material; skipping it.")

    if not texts:
        st.error("No text content found in the pre-class materials.")
        return None

    context_prompt = f"""
    Task: Extract Comprehensive Topics in a List Format
    You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.

    Instructions:
    Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:

    Overarching themes
    Main topics
    Subtopics and their sub-subtopics
    Indirectly related topics

    Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.

    Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.

    Output Requirements:
    Use this structure:
    {{
        "topics": [
            "Topic 1",
            "Topic 2",
            "Topic 3",
            ...
        ]
    }}
    Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.

    Content to Analyze:
    {texts}
    """
    try:
        response = model.generate_content(
            context_prompt,
            generation_config=genai.GenerationConfig(temperature=0.3),
        )
        if not response or not response.text:
            st.error("Error extracting topics from materials.")
            return None
        return response.text
    except Exception as e:
        st.error(f"Error extracting topics: {str(e)}")
        return None
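
# Usage sketch (assumed, not part of the original flow): the function returns
# the model's raw JSON text, so a caller would parse it themselves, e.g.:
#     raw = extract_topics_from_materials("S104")
#     topic_list = json.loads(raw)["topics"] if raw else []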
|
|
|
|
|
def get_chat_history(user_id, session_id):
    query = {
        "user_id": ObjectId(user_id),
        "session_id": session_id,
        "timestamp": {"$lte": datetime.utcnow()},
    }
    return list(chat_history_collection.find(query))


def json_serializer(obj):
    if isinstance(obj, ObjectId):
        return str(obj)
    raise TypeError(f"Type {type(obj)} not serializable")
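
# Intended for json.dumps(..., default=json_serializer), so BSON ObjectIds
# serialize as strings, e.g.:
#     json.dumps({"user_id": ObjectId()}, default=json_serializer)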
|
|
|
if __name__ == "__main__":
    session_id = "S104"

    user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
    print("Found user_ids:", user_ids)

    all_chat_histories = []
    for user_id in user_ids:
        result = get_chat_history(user_id, session_id)
        print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
        for record in result:
            all_chat_histories.append({
                "user_id": record["user_id"],
                "session_id": record["session_id"],
                "messages": record["messages"],
            })

    print("Total chat histories collected:", len(all_chat_histories))

    # Topics were extracted ahead of time; load them from the saved file.
    with open("sample_files/extracted_topics.json", "r") as file:
        topics = json.load(file)
    print("Extracted topics:", topics)

    analytics_generator = NovaScholarAnalytics()
    analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
    print("Generated Analytics:", analytics)
|
|