from datetime import datetime
import json
from bson import ObjectId
import typing_extensions as typing
import google.generativeai as genai
from typing import List, Dict, Any
import numpy as np
from collections import defaultdict
from dotenv import load_dotenv
import os
import pymongo
from pymongo import MongoClient
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_KEY")
class NovaScholarAnalytics:
def __init__(self, model_name: str = "gemini-1.5-flash"):
genai.configure(api_key=GEMINI_API_KEY)
self.model = genai.GenerativeModel(model_name)
def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
"""Preprocess chat histories to focus on relevant information."""
processed = []
for chat in chat_histories:
# Convert ObjectId to string if it's an ObjectId
user_id = str(chat["user_id"]["$oid"]) if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"] else str(chat["user_id"])
try:
processed_chat = {
"user_id": user_id,
"messages": [
{
"prompt": msg["prompt"],
"response": msg["response"]
}
for msg in chat["messages"]
]
}
processed.append(processed_chat)
print(f"Successfully processed chat for user: {user_id}")
except Exception as e:
print(f"Error processing chat for user: {user_id}")
print(f"Error details: {str(e)}")
continue
return processed
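    # _create_analytics_prompt receives chat records in the shape produced above:
    # {"user_id": "<str>", "messages": [{"prompt": "...", "response": "..."}]}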
def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str:
"""Creates a structured prompt for Gemini to analyze chat histories."""
return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE.
Context:
- Chat histories: {json.dumps(chat_histories, indent=2)}
- These are pre-class interactions between students and an AI tutor
- Topics covered: {', '.join(all_topics)}
Your task is to provide analytics with supporting evidence from the chat histories.
Output Format (strictly follow this JSON structure):
{{
"topic_wise_insights": [
{{
"topic": "<string>",
"struggling_percentage": <number between 0 and 1>,
"evidence": {{
"calculation": "Explain how struggling_percentage was calculated",
"supporting_messages": [
{{
"user_id": "<string>",
"message": "<string>",
"reasoning": "Why this message indicates struggling"
}}
]
}},
"key_issues": ["<string>"],
"key_misconceptions": ["<string>"],
"evidence_for_issues": [
{{
"issue": "<string>",
"supporting_messages": [
{{
"user_id": "<string>",
"message": "<string>"
}}
]
}}
]
}}
],
"ai_recommended_actions": [
{{
"action": "<string>",
"priority": "high|medium|low",
"reasoning": "<string>",
"evidence": {{
"supporting_messages": [
{{
"user_id": "<string>",
"message": "<string>",
"relevance": "Why this message supports the recommendation"
}}
],
"pattern_description": "Description of the pattern observed in chat histories"
}},
"expected_outcome": "<string>"
}}
],
"student_analytics": [
{{
"student_id": "<string>",
"engagement_metrics": {{
"participation_level": <number between 0 and 1>,
"concept_understanding": "strong|moderate|needs_improvement",
"question_quality": "advanced|intermediate|basic"
}},
"evidence": {{
"participation_calculation": "Explain how participation_level was calculated",
"understanding_evidence": [
{{
"message": "<string>",
"analysis": "Why this indicates their understanding level"
}}
],
"question_quality_evidence": [
{{
"question": "<string>",
"analysis": "Why this question is classified at this level"
}}
]
}},
"struggling_topics": ["<string>"],
"personalized_recommendation": "<string>"
}}
]
}}
Guidelines for Analysis:
1. For every insight, recommendation, or metric, provide specific evidence from the chat histories
2. Explain calculations (e.g., how struggling_percentage was derived)
3. Include relevant message excerpts that support each conclusion
4. For recommendations, show the pattern of student interactions that led to that recommendation
5. When analyzing question quality or understanding, provide reasoning for the classification
The response must adhere strictly to the above JSON structure, with all fields populated appropriately."""
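    # Note: the JSON structure above is enforced only through the prompt text plus
    # response_mime_type="application/json"; downstream code therefore accesses the
    # returned keys defensively (.get with defaults, try/except around json.loads).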
def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
"""Validate the initial analytics by checking evidence."""
validation_prompt = f"""Review and validate the following analytics based on the provided evidence.
Analytics to validate: {json.dumps(initial_analytics, indent=2)}
For each section:
1. Verify if the evidence supports the conclusions
2. Check if calculations (percentages, metrics) are justified by the data
3. Validate if recommendations are supported by patterns in the chat history
Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
For any removed items, include them in a separate "insufficient_evidence" section with explanation."""
try:
validation_response = self.model.generate_content(
validation_prompt,
generation_config=genai.GenerationConfig(
response_mime_type="application/json",
temperature=0.1
)
)
validated_analytics = json.loads(validation_response.text)
return validated_analytics
except Exception as e:
print(f"Error in validation: {str(e)}")
return initial_analytics
def _enrich_analytics(self, analytics: Dict) -> Dict:
"""Add derived insights and metrics to the validated analytics."""
try:
# Calculate class distribution
            # "student_analytics" matches the key produced by _create_analytics_prompt
            total_students = len(analytics.get("student_analytics", []))
            performance_distribution = defaultdict(int)
            for student in analytics.get("student_analytics", []):
metrics = student.get("engagement_metrics", {})
understanding = metrics.get("concept_understanding", "moderate")
if understanding == "strong":
performance_distribution["high_performers"] += 1
elif understanding == "needs_improvement":
performance_distribution["at_risk"] += 1
else:
performance_distribution["average_performers"] += 1
# Convert to percentages
class_distribution = {
level: count/total_students if total_students > 0 else 0
for level, count in performance_distribution.items()
}
# Calculate overall engagement
engagement_sum = sum(
student.get("engagement_metrics", {}).get("participation_level", 0)
                for student in analytics.get("student_analytics", [])
)
overall_engagement = engagement_sum / total_students if total_students > 0 else 0
# Identify critical topics (those with high struggling percentage)
critical_topics = [
topic["topic"]
for topic in analytics.get("topic_wise_insights", [])
if topic.get("struggling_percentage", 0) > 0.7 # 70% threshold
]
# Identify students needing intervention
immediate_attention = []
monitoring_required = []
            for student in analytics.get("student_analytics", []):
student_id = student.get("student_id")
metrics = student.get("engagement_metrics", {})
# Check for immediate attention needed
if (metrics.get("concept_understanding") == "needs_improvement" or
metrics.get("participation_level", 0) < 0.3 or # Less than 30% participation
len(student.get("struggling_topics", [])) > 2): # Struggling with more than 2 topics
immediate_attention.append(student_id)
# Check for monitoring
elif (metrics.get("concept_understanding") == "moderate" or
metrics.get("participation_level", 0) < 0.5): # Less than 50% participation
monitoring_required.append(student_id)
# Add enriched data to analytics
analytics["course_health"] = {
"overall_engagement": overall_engagement,
"critical_topics": critical_topics,
"class_distribution": class_distribution
}
analytics["intervention_metrics"] = {
"immediate_attention_needed": immediate_attention,
"monitoring_required": monitoring_required
}
# Add evidence for enriched metrics
analytics["course_health"]["evidence"] = {
"engagement_calculation": f"Calculated from average participation level of {total_students} students",
"critical_topics_criteria": "Topics where over 70% of students are struggling",
"distribution_calculation": "Based on concept understanding levels from student metrics"
}
analytics["intervention_metrics"]["evidence"] = {
"immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
"monitoring_criteria": "Students with moderate understanding or participation < 50%"
}
return analytics
except Exception as e:
print(f"Error enriching analytics: {str(e)}")
return analytics # Return original analytics if enrichment fails
def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
"""Main method to generate analytics with evidence-based validation."""
try:
if not chat_histories or not all_topics:
print("Missing required input data")
return self._fallback_analytics()
try:
processed_histories = self._preprocess_chat_histories(chat_histories)
print("Successfully preprocessed chat histories")
except Exception as preprocess_error:
print(f"Error in preprocessing: {str(preprocess_error)}")
return self._fallback_analytics()
try:
prompt = self._create_analytics_prompt(processed_histories, all_topics)
print("Successfully created prompt")
print("Prompt preview:", prompt[:200] + "...") # Print first 200 chars
except Exception as prompt_error:
print(f"Error in prompt creation: {str(prompt_error)}")
return self._fallback_analytics()
# Generate initial analytics with evidence
response = self.model.generate_content(
prompt,
generation_config=genai.GenerationConfig(
response_mime_type="application/json",
temperature=0.15
)
)
print(response.text)
if not response.text:
print("Empty response from Gemini")
return self._fallback_analytics()
            # Parse initial analytics
            initial_analytics = json.loads(response.text)
            print("Initial analytics:", initial_analytics)
            # Validate analytics using evidence
            validated_analytics = self._validate_analytics_with_evidence(initial_analytics)
            # Enrich with additional metrics
            final_analytics = self._enrich_analytics(validated_analytics)
return final_analytics
except Exception as e:
print(f"Error generating analytics: {str(e)}")
return self._fallback_analytics()
def _fallback_analytics(self) -> Dict:
"""Provide fallback analytics with explanation."""
return {
"topic_insights": [],
"student_insights": [],
"recommended_actions": [
{
"action": "Review analytics generation process",
"priority": "high",
"target_group": "system_administrators",
"reasoning": "Analytics generation failed",
"expected_impact": "Restore analytics functionality",
"evidence": {
"error": "Analytics generation failed to complete"
}
}
],
"course_health": {
"overall_engagement": 0,
"critical_topics": [],
"class_distribution": {
"high_performers": 0,
"average_performers": 0,
"at_risk": 0
}
},
"intervention_metrics": {
"immediate_attention_needed": [],
"monitoring_required": []
}
}
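    # NOTE: the fallback keys (topic_insights, student_insights, recommended_actions) differ
    # slightly from the schema produced by _create_analytics_prompt; consumers should
    # tolerate both shapes.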
    def _process_gemini_response(self, response: str) -> Dict:
        """Parse a raw Gemini JSON response into a dictionary."""
        try:
            analytics = json.loads(response)
if not isinstance(analytics, dict):
raise ValueError("Invalid response format")
return analytics
except Exception as e:
print(f"Error processing Gemini response: {str(e)}")
return self._fallback_analytics()
load_dotenv()
MONGODB_URI = os.getenv("MONGO_URI")
from file_upload_vectorize import model
import streamlit as st
def extract_topics_from_materials(session_id):
"""Extract topics from pre-class materials"""
    # A PyMongo cursor is always truthy, so materialize it before checking for emptiness.
    materials = list(resources_collection.find({"session_id": session_id}))
    if not materials:
        st.error("No pre-class materials found for this session.")
        return None
    texts = ""
    for material in materials:
        if 'text_content' in material:
            texts += material['text_content'] + "\n"
        else:
            st.warning("No text content found in a material; skipping it.")
if texts:
context_prompt = f"""
Task: Extract Comprehensive Topics in a List Format
You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.
Instructions:
Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:
Overarching themes
Main topics
Subtopics and their sub-subtopics
Indirectly related topics
Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.
Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.
Output Requirements:
Use this structure:
{{
"topics": [
"Topic 1",
"Topic 2",
"Topic 3",
...
]
}}
Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.
Content to Analyze:
{texts}
"""
try:
# response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(response_mime_type="application/json", response_schema=list[Topics]))
response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(temperature=0.3))
if not response or not response.text:
st.error("Error extracting topics from materials.")
return
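            # response.text is the raw JSON string emitted by the model (not a parsed list);
            # callers that need a Python list should json.loads it first.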
topics = response.text
return topics
except Exception as e:
st.error(f"Error extracting topics: {str(e)}")
return None
else:
st.error("No text content found in the pre-class materials.")
return None
def get_chat_history(user_id, session_id):
query = {
"user_id": ObjectId(user_id),
"session_id": session_id,
"timestamp": {"$lte": datetime.utcnow()}
}
result = chat_history_collection.find(query)
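    # Each returned document is expected to carry user_id, session_id, and a
    # "messages" list of {"prompt": ..., "response": ...} dicts
    # (see _preprocess_chat_histories and the __main__ block below).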
return list(result)
def json_serializer(obj):
if isinstance(obj, ObjectId):
return str(obj)
raise TypeError(f"Type {type(obj)} not serializable")
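# Example usage: json.dumps(chat_history, default=json_serializer) to make ObjectId values serializable.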
if __name__ == "__main__":
client = MongoClient(MONGODB_URI)
db = client["novascholar_db"]
chat_history_collection = db["chat_history"]
resources_collection = db["resources"]
session_id = "S104"
    # Find all users who have chat history for this session
user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
# Debug print 2: Check user_ids
print("Found user_ids:", user_ids)
all_chat_histories = []
for user_id in user_ids:
result = get_chat_history(user_id, session_id)
# Debug print 3: Check each chat history result
print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
if result:
for record in result:
chat_history = {
"user_id": record["user_id"], # Convert ObjectId to string
"session_id": record["session_id"],
"messages": record["messages"]
}
all_chat_histories.append(chat_history)
print(all_chat_histories)
# Export all chat histories to a JSON file
# Path: sample_files/chat_histories.json
# with open("sample_files/all_chat_histories3.json", "w") as file:
# json.dump(all_chat_histories, file, indent=2)
# Debug print 4: Check chat histories
print("Total chat histories collected:", len(all_chat_histories))
# Extract topics with debug print
# topics = extract_topics_from_materials(session_id)
# # Export extracted topics to a JSON file
# with open("sample_files/extracted_topics.json", "w") as file:
# json.dump(topics, file, indent=2)
# Load extracted topics from JSON file
with open("sample_files/extracted_topics.json", "r") as file:
topics = json.load(file)
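    # generate_analytics expects a flat list of topic strings; if the saved file uses the
    # {"topics": [...]} structure requested in extract_topics_from_materials, pass
    # topics["topics"] instead (assumption about the file's exact shape).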
# Debug print 5: Check topics
print("Extracted topics:", topics)
# Generate analytics
analytics_generator = NovaScholarAnalytics()
analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
# Debug print 6: Check generated analytics
print("Generated Analytics:", analytics)