from datetime import datetime
import json
import os
from collections import defaultdict
from typing import Dict, List

from bson import ObjectId
from dotenv import load_dotenv
from pymongo import MongoClient
import google.generativeai as genai

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_KEY")
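
# Pipeline (as implemented below): preprocess chat histories -> prompt Gemini
# for evidence-backed analytics -> validate the evidence -> enrich with derived
# metrics (class distribution, engagement, intervention lists).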

class NovaScholarAnalytics:
    def __init__(self, model_name: str = "gemini-1.5-flash"):
        genai.configure(api_key=GEMINI_API_KEY)
        self.model = genai.GenerativeModel(model_name)
    
    def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
        """Preprocess chat histories to focus on relevant information."""
        processed = []

        for chat in chat_histories:
            # user_id may arrive as a raw ObjectId/str or as an extended-JSON
            # dict of the form {"$oid": "..."}; normalize both to a plain string
            if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"]:
                user_id = str(chat["user_id"]["$oid"])
            else:
                user_id = str(chat["user_id"])
            
            try:
                processed_chat = {
                    "user_id": user_id,
                    "messages": [
                        {
                            "prompt": msg["prompt"],
                            "response": msg["response"]
                        }
                        for msg in chat["messages"]
                    ]
                }
                processed.append(processed_chat)
                print(f"Successfully processed chat for user: {user_id}")
            except Exception as e:
                print(f"Error processing chat for user: {user_id}")
                print(f"Error details: {str(e)}")
                continue
                
        return processed
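
    # Illustrative (hypothetical) input document accepted by the method above:
    #   {"user_id": {"$oid": "64f0..."},          # or a raw ObjectId / plain str
    #    "messages": [{"prompt": "...", "response": "..."}]}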
    
    def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str:
        """Creates a structured prompt for Gemini to analyze chat histories."""
        return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE.

        Context:
        - Chat histories: {json.dumps(chat_histories, indent=2)}
        - These are pre-class interactions between students and an AI tutor
        - Topics covered: {', '.join(all_topics)}

        Your task is to provide analytics with supporting evidence from the chat histories.

        Output Format (strictly follow this JSON structure):
        {{
        "topic_wise_insights": [
            {{
            "topic": "<string>",
            "struggling_percentage": <number between 0 and 1>,
            "evidence": {{
                "calculation": "Explain how struggling_percentage was calculated",
                "supporting_messages": [
                    {{
                        "user_id": "<string>",
                        "message": "<string>",
                        "reasoning": "Why this message indicates struggling"
                    }}
                ]
            }},
            "key_issues": ["<string>"],
            "key_misconceptions": ["<string>"],
            "evidence_for_issues": [
                {{
                    "issue": "<string>",
                    "supporting_messages": [
                        {{
                            "user_id": "<string>",
                            "message": "<string>"
                        }}
                    ]
                }}
            ]
            }}
        ],
        "ai_recommended_actions": [
            {{
            "action": "<string>",
            "priority": "high|medium|low",
            "reasoning": "<string>",
            "evidence": {{
                "supporting_messages": [
                    {{
                        "user_id": "<string>",
                        "message": "<string>",
                        "relevance": "Why this message supports the recommendation"
                    }}
                ],
                "pattern_description": "Description of the pattern observed in chat histories"
            }},
            "expected_outcome": "<string>"
            }}
        ],
        "student_analytics": [
            {{
            "student_id": "<string>",
            "engagement_metrics": {{
                "participation_level": <number between 0 and 1>,
                "concept_understanding": "strong|moderate|needs_improvement",
                "question_quality": "advanced|intermediate|basic"
            }},
            "evidence": {{
                "participation_calculation": "Explain how participation_level was calculated",
                "understanding_evidence": [
                    {{
                        "message": "<string>",
                        "analysis": "Why this indicates their understanding level"
                    }}
                ],
                "question_quality_evidence": [
                    {{
                        "question": "<string>",
                        "analysis": "Why this question is classified at this level"
                    }}
                ]
            }},
            "struggling_topics": ["<string>"],
            "personalized_recommendation": "<string>"
            }}
        ]
        }}

        Guidelines for Analysis:
        1. For every insight, recommendation, or metric, provide specific evidence from the chat histories
        2. Explain calculations (e.g., how struggling_percentage was derived)
        3. Include relevant message excerpts that support each conclusion
        4. For recommendations, show the pattern of student interactions that led to that recommendation
        5. When analyzing question quality or understanding, provide reasoning for the classification
        
        The response must adhere strictly to the above JSON structure, with all fields populated appropriately."""

    def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
        """Validate the initial analytics by checking evidence."""
        validation_prompt = f"""Review and validate the following analytics based on the provided evidence.
        
        Analytics to validate: {json.dumps(initial_analytics, indent=2)}
        
        For each section:
        1. Verify if the evidence supports the conclusions
        2. Check if calculations (percentages, metrics) are justified by the data
        3. Validate if recommendations are supported by patterns in the chat history
        
        Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
        For any removed items, include them in a separate "insufficient_evidence" section with explanation."""
        
        try:
            validation_response = self.model.generate_content(
                validation_prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.1
                )
            )
            
            validated_analytics = json.loads(validation_response.text)
            return validated_analytics
            
        except Exception as e:
            print(f"Error in validation: {str(e)}")
            return initial_analytics

    def _enrich_analytics(self, analytics: Dict) -> Dict:
        """Add derived insights and metrics to the validated analytics."""
        try:
            # Calculate class distribution (keys follow the prompt's output
            # schema, which names the per-student section "student_analytics")
            total_students = len(analytics.get("student_analytics", []))
            performance_distribution = defaultdict(int)
            
            for student in analytics.get("student_analytics", []):
                metrics = student.get("engagement_metrics", {})
                understanding = metrics.get("concept_understanding", "moderate")
                
                if understanding == "strong":
                    performance_distribution["high_performers"] += 1
                elif understanding == "needs_improvement":
                    performance_distribution["at_risk"] += 1
                else:
                    performance_distribution["average_performers"] += 1
            
            # Convert to percentages
            class_distribution = {
                level: count/total_students if total_students > 0 else 0
                for level, count in performance_distribution.items()
            }
            
            # Calculate overall engagement
            engagement_sum = sum(
                student.get("engagement_metrics", {}).get("participation_level", 0)
                for student in analytics.get("student_analytics", [])
            )
            overall_engagement = engagement_sum / total_students if total_students > 0 else 0
            
            # Identify critical topics (those with high struggling percentage)
            critical_topics = [
                topic["topic"]
                for topic in analytics.get("topic_wise_insights", [])
                if topic.get("struggling_percentage", 0) > 0.7  # 70% threshold
            ]
            
            # Identify students needing intervention
            immediate_attention = []
            monitoring_required = []
            
            for student in analytics.get("student_analytics", []):
                student_id = student.get("student_id")
                metrics = student.get("engagement_metrics", {})
                
                # Check for immediate attention needed
                if (metrics.get("concept_understanding") == "needs_improvement" or
                    metrics.get("participation_level", 0) < 0.3 or  # Less than 30% participation
                    len(student.get("struggling_topics", [])) > 2):  # Struggling with more than 2 topics
                    immediate_attention.append(student_id)
                # Check for monitoring
                elif (metrics.get("concept_understanding") == "moderate" or
                    metrics.get("participation_level", 0) < 0.5):  # Less than 50% participation
                    monitoring_required.append(student_id)
            
            # Add enriched data to analytics
            analytics["course_health"] = {
                "overall_engagement": overall_engagement,
                "critical_topics": critical_topics,
                "class_distribution": class_distribution
            }
            
            analytics["intervention_metrics"] = {
                "immediate_attention_needed": immediate_attention,
                "monitoring_required": monitoring_required
            }
            
            # Add evidence for enriched metrics
            analytics["course_health"]["evidence"] = {
                "engagement_calculation": f"Calculated from average participation level of {total_students} students",
                "critical_topics_criteria": "Topics where over 70% of students are struggling",
                "distribution_calculation": "Based on concept understanding levels from student metrics"
            }
            
            analytics["intervention_metrics"]["evidence"] = {
                "immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
                "monitoring_criteria": "Students with moderate understanding or participation < 50%"
            }
            
            return analytics
            
        except Exception as e:
            print(f"Error enriching analytics: {str(e)}")
            return analytics  # Return original analytics if enrichment fails
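
    # Note: the 0.7 (critical topic), 0.3 / 0.5 (participation) and >2
    # struggling-topics thresholds above are hard-coded heuristics, not values
    # derived from the data; tune them as needed.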

    def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
        """Main method to generate analytics with evidence-based validation."""
        try:
            if not chat_histories or not all_topics:
                print("Missing required input data")
                return self._fallback_analytics()

            try:
                processed_histories = self._preprocess_chat_histories(chat_histories)
                print("Successfully preprocessed chat histories")
            except Exception as preprocess_error:
                print(f"Error in preprocessing: {str(preprocess_error)}")
                return self._fallback_analytics()
            
            try:
                prompt = self._create_analytics_prompt(processed_histories, all_topics)
                print("Successfully created prompt")
                print("Prompt preview:", prompt[:200] + "...") # Print first 200 chars
            except Exception as prompt_error:
                print(f"Error in prompt creation: {str(prompt_error)}")
                return self._fallback_analytics()
            
            # Generate initial analytics with evidence
            response = self.model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.15
                )
            )
            print(response.text)
            
            if not response.text:
                print("Empty response from Gemini")
                return self._fallback_analytics()
            
            # Parse initial analytics
            initial_analytics = json.loads(response.text)
            print("Initial analytics:", initial_analytics)
            
            # Validate analytics using evidence
            validated_analytics = self._validate_analytics_with_evidence(initial_analytics)
            
            # Enrich with additional metrics
            final_analytics = self._enrich_analytics(validated_analytics)
            
            return final_analytics
            
        except Exception as e:
            print(f"Error generating analytics: {str(e)}")
            return self._fallback_analytics()

    def _fallback_analytics(self) -> Dict:
        """Provide fallback analytics with explanation."""
        return {
            "topic_wise_insights": [],
            "student_analytics": [],
            "ai_recommended_actions": [
                {
                    "action": "Review analytics generation process",
                    "priority": "high",
                    "target_group": "system_administrators",
                    "reasoning": "Analytics generation failed",
                    "expected_impact": "Restore analytics functionality",
                    "evidence": {
                        "error": "Analytics generation failed to complete"
                    }
                }
            ],
            "course_health": {
                "overall_engagement": 0,
                "critical_topics": [],
                "class_distribution": {
                    "high_performers": 0,
                    "average_performers": 0,
                    "at_risk": 0
                }
            },
            "intervention_metrics": {
                "immediate_attention_needed": [],
                "monitoring_required": []
            }
        }

    def _process_gemini_response(self, response: str) -> Dict:
        """Parse Gemini's raw JSON text into a dict, falling back on error."""
        try:
            # json_serializer below is a json.dumps `default=` hook, not a
            # loads object_hook, so parse the response without it
            analytics = json.loads(response)
            if not isinstance(analytics, dict):
                raise ValueError("Invalid response format")
            return analytics
        except Exception as e:
            print(f"Error processing Gemini response: {str(e)}")
            return self._fallback_analytics()
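
# Minimal usage sketch (assumes chat histories and a topic list are already
# loaded; values are illustrative):
#
#     generator = NovaScholarAnalytics()
#     analytics = generator.generate_analytics(all_chat_histories, ["Topic 1", "Topic 2"])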

MONGODB_URI = os.getenv("MONGO_URI")
from file_upload_vectorize import model
import streamlit as st

def extract_topics_from_materials(session_id):
    """Extract topics from pre-class materials"""
    # Materialize the cursor so the emptiness check below works
    # (a raw PyMongo cursor is always truthy, even when it matches nothing)
    materials = list(resources_collection.find({"session_id": session_id}))
    texts = ""
    if materials:
        for material in materials:
            if 'text_content' in material:
                text = material['text_content']
                texts += text + "\n"
            else:
                st.warning("No text content found in the material.")
                return
    else:
        st.error("No pre-class materials found for this session.")
        return

    if texts:
        context_prompt = f"""
        Task: Extract Comprehensive Topics in a List Format
        You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.

        Instructions:
        Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:

        Overarching themes
        Main topics
        Subtopics and their sub-subtopics
        Indirectly related topics
        Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.

        Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.

        Output Requirements:
        Use this structure:
        {{
            "topics": [
                "Topic 1",
                "Topic 2",
                "Topic 3",
                ...
            ]
        }}
        Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.

        Content to Analyze:
        {texts}
        """
        try:
            response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(temperature=0.3))
            if not response or not response.text:
                st.error("Error extracting topics from materials.")
                return
            
            topics = response.text
            return topics
        except Exception as e:
            st.error(f"Error extracting topics: {str(e)}")
            return None
    else:
        st.error("No text content found in the pre-class materials.")
        return None
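
# Per the prompt's output requirements, the returned string should parse as:
#   {"topics": ["Topic 1", "Topic 2", ...]}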


def get_chat_history(user_id, session_id):
    query = {
        "user_id": ObjectId(user_id),
        "session_id": session_id,
        "timestamp": {"$lte": datetime.utcnow()}
    }
    result = chat_history_collection.find(query)
    return list(result)
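
# Each chat_history document is expected to provide at least user_id (ObjectId),
# session_id, messages, and timestamp -- the fields read above and in __main__.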

def json_serializer(obj):
    if isinstance(obj, ObjectId):
        return str(obj)
    raise TypeError(f"Type {type(obj)} not serializable")
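
# json_serializer is intended as a `default=` hook for json.dumps, e.g.:
#     json.dumps(all_chat_histories, default=json_serializer, indent=2)
# so ObjectId values serialize as strings instead of raising TypeError.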

if __name__ == "__main__":
    # Connect to MongoDB
    client = MongoClient(MONGODB_URI)
    db = client["novascholar_db"]
    chat_history_collection = db["chat_history"]
    resources_collection = db["resources"]
    session_id = "S104"

    user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
    # Debug print 2: Check user_ids
    print("Found user_ids:", user_ids)
    
    all_chat_histories = []
    for user_id in user_ids:
        result = get_chat_history(user_id, session_id)
        # Debug print 3: Check each chat history result
        print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
        if result:
            for record in result:
                chat_history = {
                    "user_id": str(record["user_id"]),  # ObjectId -> str for JSON serialization
                    "session_id": record["session_id"],
                    "messages": record["messages"]
                }
                all_chat_histories.append(chat_history)

    print(all_chat_histories)

    # Export all chat histories to a JSON file
    # Path: sample_files/chat_histories.json
    # with open("sample_files/all_chat_histories3.json", "w") as file:
    #     json.dump(all_chat_histories, file, indent=2)

    # Debug print 4: Check chat histories
    print("Total chat histories collected:", len(all_chat_histories))

    # Extract topics with debug print
    # topics = extract_topics_from_materials(session_id)
    # # Export extracted topics to a JSON file
    # with open("sample_files/extracted_topics.json", "w") as file:
    #     json.dump(topics, file, indent=2)

    # Load extracted topics from JSON file
    with open("sample_files/extracted_topics.json", "r") as file:
        topics = json.load(file)
    # Debug print 5: Check topics
    print("Extracted topics:", topics)
    
    # Generate analytics
    analytics_generator = NovaScholarAnalytics()
    analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
    # Debug print 6: Check generated analytics
    print("Generated Analytics:", analytics)