Harshal Vhatkar committed
Commit 7dd7d88 · 1 Parent(s): e7247bf

improve video upload

file_upload_vectorize.py CHANGED

@@ -141,12 +141,18 @@ def extract_text_from_file(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
 
+from sentence_transformers import SentenceTransformer
+
 def get_embedding(text):
-    response = openai.embeddings.create(
-        model="text-embedding-ada-002",
-        input=text
-    )
-    return response.data[0].embedding
+    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    embeddings = model.encode(text)
+    # response = openai.embeddings.create(
+    #     model="text-embedding-ada-002",
+    #     input=text
+    # )
+    # return response.data[0].embedding
+    return embeddings
+
 
 def create_vector_store(text, resource_id):
     # resource_object_id = ObjectId(resource_id)
@@ -169,11 +175,19 @@ def create_vector_store(text, resource_id):
 
     vector_data = {
         "resource_id": resource_id,
-        "vector": embedding,
+        "vector": embedding.tolist(),
         "text": text,
         "created_at": datetime.utcnow()
     }
 
-    vectors_collection.insert_one(vector_data)
-
-    # return VectorStoreIndex.from_documents([document])
+    # vectors_collection.insert_one(vector_data)
+    # Store in MongoDB
+    try:
+        vectors_collection.insert_one(vector_data)
+    except Exception as db_error:
+        st.error(f"Database error: {str(db_error)}")
+        return None
+
+    # return VectorStoreIndex.from_documents([document])
+    return vector_data
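Note on the embedding change above: get_embedding now encodes text locally with the Sentence-Transformers model all-MiniLM-L6-v2 instead of calling the OpenAI API, and create_vector_store stores the vector as a plain Python list so MongoDB can serialize it. Below is a minimal standalone sketch of that flow, for reference only; the connection string, database, and collection names are placeholders rather than the values this app actually uses.

from datetime import datetime

from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

# Placeholder connection details -- the real app configures these elsewhere.
client = MongoClient("mongodb://localhost:27017")
vectors_collection = client["demo_db"]["resource_vectors"]

# Same model the commit switches to; downloaded on first use.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def embed_and_store(text: str, resource_id: str) -> dict:
    """Encode the text locally and persist the vector alongside the raw text."""
    embedding = model.encode(text)  # NumPy array, 384 dimensions for MiniLM-L6-v2
    vector_data = {
        "resource_id": resource_id,
        "vector": embedding.tolist(),  # lists are BSON-serializable; NumPy arrays are not
        "text": text,
        "created_at": datetime.utcnow(),
    }
    vectors_collection.insert_one(vector_data)
    return vector_data

if __name__ == "__main__":
    doc = embed_and_store("Transcript excerpt about binary search trees.", "demo-resource-1")
    print(len(doc["vector"]), doc["vector"][:5])

Converting to a list before insert_one is the key detail: pymongo cannot encode a raw NumPy array, which is exactly the kind of failure the added try/except around the insert now reports instead of crashing the Streamlit page.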
pre_class_analytics3.py DELETED
@@ -1,499 +0,0 @@
1
- from datetime import datetime
2
- import json
3
- from bson import ObjectId
4
- import typing_extensions as typing
5
- import google.generativeai as genai
6
- from typing import List, Dict, Any
7
- import numpy as np
8
- from collections import defaultdict
9
-
10
- from dotenv import load_dotenv
11
- import os
12
- import pymongo
13
- from pymongo import MongoClient
14
-
15
- load_dotenv()
16
- GEMINI_API_KEY = os.getenv("GEMINI_KEY")
17
-
18
- class NovaScholarAnalytics:
19
- def __init__(self, model_name: str = "gemini-1.5-flash"):
20
- genai.configure(api_key=GEMINI_API_KEY)
21
- self.model = genai.GenerativeModel(model_name)
22
-
23
- def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
24
- # Code 2:
25
- """Preprocess chat histories to focus on relevant information."""
26
- processed = []
27
-
28
- for chat in chat_histories:
29
- # Convert ObjectId to string if it's an ObjectId
30
- user_id = str(chat["user_id"]["$oid"]) if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"] else str(chat["user_id"])
31
-
32
- try:
33
- processed_chat = {
34
- "user_id": user_id,
35
- "messages": [
36
- {
37
- "prompt": msg["prompt"],
38
- "response": msg["response"]
39
- }
40
- for msg in chat["messages"]
41
- ]
42
- }
43
- processed.append(processed_chat)
44
- print(f"Successfully processed chat for user: {user_id}")
45
- except Exception as e:
46
- print(f"Error processing chat for user: {user_id}")
47
- print(f"Error details: {str(e)}")
48
- continue
49
-
50
- return processed
51
-
52
- def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str:
53
- """Creates a structured prompt for Gemini to analyze chat histories."""
54
- return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE.
55
-
56
- Context:
57
- - Chat histories: {json.dumps(chat_histories, indent=2)}
58
- - These are pre-class interactions between students and an AI tutor
59
- - Topics covered: {', '.join(all_topics)}
60
-
61
- Your task is to provide analytics with supporting evidence from the chat histories.
62
-
63
- Output Format (strictly follow this JSON structure):
64
- {{
65
- "topic_wise_insights": [
66
- {{
67
- "topic": "<string>",
68
- "struggling_percentage": <number between 0 and 1>,
69
- "evidence": {{
70
- "calculation": "Explain how struggling_percentage was calculated",
71
- "supporting_messages": [
72
- {{
73
- "user_id": "<string>",
74
- "message": "<string>",
75
- "reasoning": "Why this message indicates struggling"
76
- }}
77
- ]
78
- }},
79
- "key_issues": ["<string>"],
80
- "key_misconceptions": ["<string>"],
81
- "evidence_for_issues": [
82
- {{
83
- "issue": "<string>",
84
- "supporting_messages": [
85
- {{
86
- "user_id": "<string>",
87
- "message": "<string>"
88
- }}
89
- ]
90
- }}
91
- ]
92
- }}
93
- ],
94
- "ai_recommended_actions": [
95
- {{
96
- "action": "<string>",
97
- "priority": "high|medium|low",
98
- "reasoning": "<string>",
99
- "evidence": {{
100
- "supporting_messages": [
101
- {{
102
- "user_id": "<string>",
103
- "message": "<string>",
104
- "relevance": "Why this message supports the recommendation"
105
- }}
106
- ],
107
- "pattern_description": "Description of the pattern observed in chat histories"
108
- }},
109
- "expected_outcome": "<string>"
110
- }}
111
- ],
112
- "student_analytics": [
113
- {{
114
- "student_id": "<string>",
115
- "engagement_metrics": {{
116
- "participation_level": <number between 0 and 1>,
117
- "concept_understanding": "strong|moderate|needs_improvement",
118
- "question_quality": "advanced|intermediate|basic"
119
- }},
120
- "evidence": {{
121
- "participation_calculation": "Explain how participation_level was calculated",
122
- "understanding_evidence": [
123
- {{
124
- "message": "<string>",
125
- "analysis": "Why this indicates their understanding level"
126
- }}
127
- ],
128
- "question_quality_evidence": [
129
- {{
130
- "question": "<string>",
131
- "analysis": "Why this question is classified at this level"
132
- }}
133
- ]
134
- }},
135
- "struggling_topics": ["<string>"],
136
- "personalized_recommendation": "<string>"
137
- }}
138
- ]
139
- }}
140
-
141
- Guidelines for Analysis:
142
- 1. For every insight, recommendation, or metric, provide specific evidence from the chat histories
143
- 2. Explain calculations (e.g., how struggling_percentage was derived)
144
- 3. Include relevant message excerpts that support each conclusion
145
- 4. For recommendations, show the pattern of student interactions that led to that recommendation
146
- 5. When analyzing question quality or understanding, provide reasoning for the classification
147
-
148
- The response must adhere strictly to the above JSON structure, with all fields populated appropriately."""
149
-
150
- def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
151
- """Validate the initial analytics by checking evidence."""
152
- validation_prompt = f"""Review and validate the following analytics based on the provided evidence.
153
-
154
- Analytics to validate: {json.dumps(initial_analytics, indent=2)}
155
-
156
- For each section:
157
- 1. Verify if the evidence supports the conclusions
158
- 2. Check if calculations (percentages, metrics) are justified by the data
159
- 3. Validate if recommendations are supported by patterns in the chat history
160
-
161
- Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
162
- For any removed items, include them in a separate "insufficient_evidence" section with explanation."""
163
-
164
- try:
165
- validation_response = self.model.generate_content(
166
- validation_prompt,
167
- generation_config=genai.GenerationConfig(
168
- response_mime_type="application/json",
169
- temperature=0.1
170
- )
171
- )
172
-
173
- validated_analytics = json.loads(validation_response.text)
174
- return validated_analytics
175
-
176
- except Exception as e:
177
- print(f"Error in validation: {str(e)}")
178
- return initial_analytics
179
-
180
- def _enrich_analytics(self, analytics: Dict) -> Dict:
181
- """Add derived insights and metrics to the validated analytics."""
182
- try:
183
- # Calculate class distribution
184
- total_students = len(analytics.get("student_insights", []))
185
- performance_distribution = defaultdict(int)
186
-
187
- for student in analytics.get("student_insights", []):
188
- metrics = student.get("engagement_metrics", {})
189
- understanding = metrics.get("concept_understanding", "moderate")
190
-
191
- if understanding == "strong":
192
- performance_distribution["high_performers"] += 1
193
- elif understanding == "needs_improvement":
194
- performance_distribution["at_risk"] += 1
195
- else:
196
- performance_distribution["average_performers"] += 1
197
-
198
- # Convert to percentages
199
- class_distribution = {
200
- level: count/total_students if total_students > 0 else 0
201
- for level, count in performance_distribution.items()
202
- }
203
-
204
- # Calculate overall engagement
205
- engagement_sum = sum(
206
- student.get("engagement_metrics", {}).get("participation_level", 0)
207
- for student in analytics.get("student_insights", [])
208
- )
209
- overall_engagement = engagement_sum / total_students if total_students > 0 else 0
210
-
211
- # Identify critical topics (those with high struggling percentage)
212
- critical_topics = [
213
- topic["topic"]
214
- for topic in analytics.get("topic_wise_insights", [])
215
- if topic.get("struggling_percentage", 0) > 0.7 # 70% threshold
216
- ]
217
-
218
- # Identify students needing intervention
219
- immediate_attention = []
220
- monitoring_required = []
221
-
222
- for student in analytics.get("student_insights", []):
223
- student_id = student.get("student_id")
224
- metrics = student.get("engagement_metrics", {})
225
-
226
- # Check for immediate attention needed
227
- if (metrics.get("concept_understanding") == "needs_improvement" or
228
- metrics.get("participation_level", 0) < 0.3 or # Less than 30% participation
229
- len(student.get("struggling_topics", [])) > 2): # Struggling with more than 2 topics
230
- immediate_attention.append(student_id)
231
- # Check for monitoring
232
- elif (metrics.get("concept_understanding") == "moderate" or
233
- metrics.get("participation_level", 0) < 0.5): # Less than 50% participation
234
- monitoring_required.append(student_id)
235
-
236
- # Add enriched data to analytics
237
- analytics["course_health"] = {
238
- "overall_engagement": overall_engagement,
239
- "critical_topics": critical_topics,
240
- "class_distribution": class_distribution
241
- }
242
-
243
- analytics["intervention_metrics"] = {
244
- "immediate_attention_needed": immediate_attention,
245
- "monitoring_required": monitoring_required
246
- }
247
-
248
- # Add evidence for enriched metrics
249
- analytics["course_health"]["evidence"] = {
250
- "engagement_calculation": f"Calculated from average participation level of {total_students} students",
251
- "critical_topics_criteria": "Topics where over 70% of students are struggling",
252
- "distribution_calculation": "Based on concept understanding levels from student metrics"
253
- }
254
-
255
- analytics["intervention_metrics"]["evidence"] = {
256
- "immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
257
- "monitoring_criteria": "Students with moderate understanding or participation < 50%"
258
- }
259
-
260
- return analytics
261
-
262
- except Exception as e:
263
- print(f"Error enriching analytics: {str(e)}")
264
- return analytics # Return original analytics if enrichment fails
265
-
266
- def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
267
- """Main method to generate analytics with evidence-based validation."""
268
- try:
269
- if not chat_histories or not all_topics:
270
- print("Missing required input data")
271
- return self._fallback_analytics()
272
-
273
- try:
274
- processed_histories = self._preprocess_chat_histories(chat_histories)
275
- print("Successfully preprocessed chat histories")
276
- except Exception as preprocess_error:
277
- print(f"Error in preprocessing: {str(preprocess_error)}")
278
- return self._fallback_analytics()
279
-
280
- try:
281
- prompt = self._create_analytics_prompt(processed_histories, all_topics)
282
- print("Successfully created prompt")
283
- print("Prompt preview:", prompt[:200] + "...") # Print first 200 chars
284
- except Exception as prompt_error:
285
- print(f"Error in prompt creation: {str(prompt_error)}")
286
- return self._fallback_analytics()
287
-
288
- # Generate initial analytics with evidence
289
- # prompt = self._create_analytics_prompt(chat_histories, all_topics)
290
- response = self.model.generate_content(
291
- prompt,
292
- generation_config=genai.GenerationConfig(
293
- response_mime_type="application/json",
294
- temperature=0.15
295
- )
296
- )
297
- print(response.text)
298
-
299
- if not response.text:
300
- print("Empty response from Gemini")
301
- return self._fallback_analytics()
302
-
303
- # Parse initial analytics
304
- # initial_analytics = self._process_gemini_response(response.text)
305
- initial_analytics2 = json.loads(response.text)
306
- print("Initial analytics:", initial_analytics2)
307
- # print("Initial analytics type:", type(initial_analytics2))
308
- # print("Moving to validation...")
309
-
310
- # Validate analytics using evidence
311
- validated_analytics = self._validate_analytics_with_evidence(initial_analytics2)
312
-
313
- # # Enrich with additional metrics
314
- final_analytics = self._enrich_analytics(validated_analytics)
315
-
316
- return final_analytics
317
-
318
- except Exception as e:
319
- print(f"Error generating analytics: {str(e)}")
320
- return self._fallback_analytics()
321
-
322
- def _fallback_analytics(self) -> Dict:
323
- """Provide fallback analytics with explanation."""
324
- return {
325
- "topic_insights": [],
326
- "student_insights": [],
327
- "recommended_actions": [
328
- {
329
- "action": "Review analytics generation process",
330
- "priority": "high",
331
- "target_group": "system_administrators",
332
- "reasoning": "Analytics generation failed",
333
- "expected_impact": "Restore analytics functionality",
334
- "evidence": {
335
- "error": "Analytics generation failed to complete"
336
- }
337
- }
338
- ],
339
- "course_health": {
340
- "overall_engagement": 0,
341
- "critical_topics": [],
342
- "class_distribution": {
343
- "high_performers": 0,
344
- "average_performers": 0,
345
- "at_risk": 0
346
- }
347
- },
348
- "intervention_metrics": {
349
- "immediate_attention_needed": [],
350
- "monitoring_required": []
351
- }
352
- }
353
- def _process_gemini_response(self, response: str) -> Dict:
354
- print("Entered here")
355
- try:
356
- analytics = json.loads(response, object_hook=json_serializer)
357
- if not isinstance(analytics, dict):
358
- raise ValueError("Invalid response format")
359
- return analytics
360
- except Exception as e:
361
- print(f"Error processing Gemini response: {str(e)}")
362
- return self._fallback_analytics()
363
-
364
- load_dotenv()
365
- MONGODB_URI = os.getenv("MONGO_URI")
366
- from file_upload_vectorize import model
367
- import streamlit as st
368
-
369
- def extract_topics_from_materials(session_id):
370
- """Extract topics from pre-class materials"""
371
- materials = resources_collection.find({"session_id": session_id})
372
- texts = ""
373
- if materials:
374
- for material in materials:
375
- if 'text_content' in material:
376
- text = material['text_content']
377
- texts += text + "\n"
378
- else:
379
- st.warning("No text content found in the material.")
380
- return
381
- else:
382
- st.error("No pre-class materials found for this session.")
383
- return
384
-
385
- if texts:
386
- context_prompt = f"""
387
- Task: Extract Comprehensive Topics in a List Format
388
- You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.
389
-
390
- Instructions:
391
- Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:
392
-
393
- Overarching themes
394
- Main topics
395
- Subtopics and their sub-subtopics
396
- Indirectly related topics
397
- Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.
398
-
399
- Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.
400
-
401
- Output Requirements:
402
- Use this structure:
403
- {{
404
- "topics": [
405
- "Topic 1",
406
- "Topic 2",
407
- "Topic 3",
408
- ...
409
- ]
410
- }}
411
- Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.
412
-
413
- Content to Analyze:
414
- {texts}
415
- """
416
- try:
417
- # response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(response_mime_type="application/json", response_schema=list[Topics]))
418
- response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(temperature=0.3))
419
- if not response or not response.text:
420
- st.error("Error extracting topics from materials.")
421
- return
422
-
423
- topics = response.text
424
- return topics
425
- except Exception as e:
426
- st.error(f"Error extracting topics: {str(e)}")
427
- return None
428
- else:
429
- st.error("No text content found in the pre-class materials.")
430
- return None
431
-
432
-
433
- def get_chat_history(user_id, session_id):
434
- query = {
435
- "user_id": ObjectId(user_id),
436
- "session_id": session_id,
437
- "timestamp": {"$lte": datetime.utcnow()}
438
- }
439
- result = chat_history_collection.find(query)
440
- return list(result)
441
-
442
- def json_serializer(obj):
443
- if isinstance(obj, ObjectId):
444
- return str(obj)
445
- raise TypeError(f"Type {type(obj)} not serializable")
446
-
447
- if __name__ == "__main__":
448
- client = MongoClient(MONGODB_URI)
449
- db = client["novascholar_db"]
450
- chat_history_collection = db["chat_history"]
451
- resources_collection = db["resources"]
452
- session_id = "S104"
453
- # Connect to MongoDB
454
- user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
455
- # Debug print 2: Check user_ids
456
- print("Found user_ids:", user_ids)
457
-
458
- all_chat_histories = []
459
- for user_id in user_ids:
460
- result = get_chat_history(user_id, session_id)
461
- # Debug print 3: Check each chat history result
462
- print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
463
- if result:
464
- for record in result:
465
- chat_history = {
466
- "user_id": record["user_id"], # Convert ObjectId to string
467
- "session_id": record["session_id"],
468
- "messages": record["messages"]
469
- }
470
- all_chat_histories.append(chat_history)
471
-
472
- print(all_chat_histories)
473
-
474
- # Export all chat histories to a JSON file
475
- # Path: sample_files/chat_histories.json
476
- # with open("sample_files/all_chat_histories3.json", "w") as file:
477
- # json.dump(all_chat_histories, file, indent=2)
478
-
479
- # Debug print 4: Check chat histories
480
- print("Total chat histories collected:", len(all_chat_histories))
481
-
482
- # Extract topics with debug print
483
- # topics = extract_topics_from_materials(session_id)
484
- # # Export extracted topics to a JSON file
485
- # with open("sample_files/extracted_topics.json", "w") as file:
486
- # json.dump(topics, file, indent=2)
487
-
488
- # Load extracted topics from JSON file
489
- with open("sample_files/extracted_topics.json", "r") as file:
490
- topics = json.load(file)
491
- # Debug print 5: Check topics
492
- print("Extracted topics:", topics)
493
-
494
- # Generate analytics
495
-
496
- analytics_generator = NovaScholarAnalytics()
497
- analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
498
- # Debug print 6: Check generated analytics
499
- print("Generated Analytics:", analytics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pre_class_analytics4.py DELETED
@@ -1,592 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from datetime import datetime
4
- from typing import List, Dict, Any, Tuple
5
- import spacy
6
- from collections import Counter, defaultdict
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from sklearn.metrics.pairwise import cosine_similarity
9
- from textblob import TextBlob
10
- import networkx as nx
11
- from scipy import stats
12
- import logging
13
- import json
14
- from dataclasses import dataclass
15
- from enum import Enum
16
-
17
- # Configure logging
18
- logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
-
21
- class TopicDifficulty(Enum):
22
- EASY = "easy"
23
- MODERATE = "moderate"
24
- DIFFICULT = "difficult"
25
- VERY_DIFFICULT = "very_difficult"
26
-
27
-
28
- @dataclass
29
- class QuestionMetrics:
30
- complexity_score: float
31
- follow_up_count: int
32
- clarification_count: int
33
- time_spent: float
34
- sentiment_score: float
35
-
36
- @dataclass
37
- class TopicInsights:
38
- difficulty_level: TopicDifficulty
39
- common_confusion_points: List[str]
40
- question_patterns: List[str]
41
- time_distribution: Dict[str, float]
42
- engagement_metrics: Dict[str, float]
43
- recommended_focus_areas: List[str]
44
-
45
- def to_dict(self):
46
- return {
47
- "difficulty_level": self.difficulty_level.value, # Convert enum to its value
48
- "common_confusion_points": self.common_confusion_points,
49
- "question_patterns": self.question_patterns,
50
- "time_distribution": {str(k): v for k, v in self.time_distribution.items()},
51
- "engagement_metrics": self.engagement_metrics,
52
- "recommended_focus_areas": self.recommended_focus_areas,
53
- }
54
-
55
- class PreClassAnalytics:
56
- def __init__(self, nlp_model: str = "en_core_web_lg"):
57
- """Initialize the analytics system with necessary components."""
58
- self.nlp = spacy.load(nlp_model)
59
- self.question_indicators = {
60
- "what", "why", "how", "when", "where", "which", "who",
61
- "whose", "whom", "can", "could", "would", "will", "explain"
62
- }
63
- self.confusion_indicators = {
64
- "confused", "don't understand", "unclear", "not clear",
65
- "stuck", "difficult", "hard", "help", "explain again"
66
- }
67
- self.follow_up_indicators = {
68
- "also", "another", "additionally", "furthermore", "moreover",
69
- "besides", "related", "similarly", "again"
70
- }
71
-
72
- def preprocess_chat_history(self, chat_history: List[Dict]) -> pd.DataFrame:
73
- """Convert chat history to DataFrame with enhanced features."""
74
- messages = []
75
- for chat in chat_history:
76
- user_id = chat['user_id']['$oid']
77
- for msg in chat['messages']:
78
- try:
79
- # Ensure the timestamp is in the correct format
80
- if isinstance(msg['timestamp'], dict) and '$date' in msg['timestamp']:
81
- timestamp = pd.to_datetime(msg['timestamp']['$date'])
82
- elif isinstance(msg['timestamp'], str):
83
- timestamp = pd.to_datetime(msg['timestamp'])
84
- else:
85
- raise ValueError("Invalid timestamp format")
86
- except Exception as e:
87
- print(f"Error parsing timestamp: {msg['timestamp']}, error: {e}")
88
- timestamp = pd.NaT # Use NaT (Not a Time) for invalid timestamps
89
-
90
- messages.append({
91
- 'user_id': user_id,
92
- 'timestamp': timestamp,
93
- 'prompt': msg['prompt'],
94
- 'response': msg['response'],
95
- 'is_question': any(q in msg['prompt'].lower() for q in self.question_indicators),
96
- 'shows_confusion': any(c in msg['prompt'].lower() for c in self.confusion_indicators),
97
- 'is_followup': any(f in msg['prompt'].lower() for f in self.follow_up_indicators)
98
- })
99
-
100
- df = pd.DataFrame(messages)
101
- df['sentiment'] = df['prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
102
- return df
103
-
104
- def extract_topic_hierarchies(self, df: pd.DataFrame) -> Dict[str, List[str]]:
105
- """Extract hierarchical topic relationships from conversations."""
106
- topic_hierarchy = defaultdict(list)
107
-
108
- for _, row in df.iterrows():
109
- doc = self.nlp(row['prompt'])
110
-
111
- # Extract main topics and subtopics using noun chunks and dependencies
112
- main_topics = []
113
- subtopics = []
114
-
115
- for chunk in doc.noun_chunks:
116
- if chunk.root.dep_ in ('nsubj', 'dobj'):
117
- main_topics.append(chunk.text.lower())
118
- else:
119
- subtopics.append(chunk.text.lower())
120
-
121
- # Build hierarchy
122
- for main_topic in main_topics:
123
- topic_hierarchy[main_topic].extend(subtopics)
124
-
125
- # Clean and deduplicate
126
- return {k: list(set(v)) for k, v in topic_hierarchy.items()}
127
-
128
- def analyze_topic_difficulty(self, df: pd.DataFrame, topic: str) -> TopicDifficulty:
129
- """Determine topic difficulty based on various metrics."""
130
- topic_msgs = df[df['prompt'].str.contains(topic, case=False)]
131
-
132
- # Calculate difficulty indicators
133
- confusion_rate = topic_msgs['shows_confusion'].mean()
134
- question_rate = topic_msgs['is_question'].mean()
135
- follow_up_rate = topic_msgs['is_followup'].mean()
136
- avg_sentiment = topic_msgs['sentiment'].mean()
137
-
138
- # Calculate composite difficulty score
139
- difficulty_score = (
140
- confusion_rate * 0.4 +
141
- question_rate * 0.3 +
142
- follow_up_rate * 0.2 +
143
- (1 - (avg_sentiment + 1) / 2) * 0.1
144
- )
145
-
146
- # Map score to difficulty level
147
- if difficulty_score < 0.3:
148
- return TopicDifficulty.EASY
149
- elif difficulty_score < 0.5:
150
- return TopicDifficulty.MODERATE
151
- elif difficulty_score < 0.7:
152
- return TopicDifficulty.DIFFICULT
153
- else:
154
- return TopicDifficulty.VERY_DIFFICULT
155
-
156
- def identify_confusion_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
157
- """Identify common patterns in student confusion."""
158
- confused_msgs = df[
159
- (df['prompt'].str.contains(topic, case=False)) &
160
- (df['shows_confusion'])
161
- ]['prompt']
162
-
163
- patterns = []
164
- for msg in confused_msgs:
165
- doc = self.nlp(msg)
166
-
167
- # Extract key phrases around confusion indicators
168
- for sent in doc.sents:
169
- for token in sent:
170
- if token.text.lower() in self.confusion_indicators:
171
- # Get context window around confusion indicator
172
- context = sent.text
173
- patterns.append(context)
174
-
175
- # Group similar patterns
176
- if patterns:
177
- vectorizer = TfidfVectorizer(ngram_range=(1, 3))
178
- tfidf_matrix = vectorizer.fit_transform(patterns)
179
- similarity_matrix = cosine_similarity(tfidf_matrix)
180
-
181
- # Cluster similar patterns
182
- G = nx.Graph()
183
- for i in range(len(patterns)):
184
- for j in range(i + 1, len(patterns)):
185
- if similarity_matrix[i][j] > 0.5: # Similarity threshold
186
- G.add_edge(i, j)
187
-
188
- # Extract representative patterns from each cluster
189
- clusters = list(nx.connected_components(G))
190
- return [patterns[min(cluster)] for cluster in clusters]
191
-
192
- return []
193
-
194
- def analyze_question_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
195
- """Analyze patterns in student questions about the topic."""
196
- topic_questions = df[
197
- (df['prompt'].str.contains(topic, case=False)) &
198
- (df['is_question'])
199
- ]['prompt']
200
-
201
- question_types = defaultdict(list)
202
- for question in topic_questions:
203
- doc = self.nlp(question)
204
-
205
- # Categorize questions
206
- if any(token.text.lower() in {"what", "define", "explain"} for token in doc):
207
- question_types["conceptual"].append(question)
208
- elif any(token.text.lower() in {"how", "steps", "process"} for token in doc):
209
- question_types["procedural"].append(question)
210
- elif any(token.text.lower() in {"why", "reason", "because"} for token in doc):
211
- question_types["reasoning"].append(question)
212
- else:
213
- question_types["other"].append(question)
214
-
215
- # Extract patterns from each category
216
- patterns = []
217
- for category, questions in question_types.items():
218
- if questions:
219
- vectorizer = TfidfVectorizer(ngram_range=(1, 3))
220
- tfidf_matrix = vectorizer.fit_transform(questions)
221
-
222
- # Get most representative questions
223
- feature_array = np.mean(tfidf_matrix.toarray(), axis=0)
224
- tfidf_sorting = np.argsort(feature_array)[::-1]
225
- features = vectorizer.get_feature_names_out()
226
-
227
- patterns.append(f"{category}: {' '.join(features[tfidf_sorting[:3]])}")
228
-
229
- return patterns
230
-
231
- def analyze_time_distribution(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
232
- """Analyze time spent on different aspects of the topic."""
233
- topic_msgs = df[df['prompt'].str.contains(topic, case=False)].copy()
234
- if len(topic_msgs) < 2:
235
- return {}
236
-
237
- topic_msgs['time_diff'] = topic_msgs['timestamp'].diff()
238
-
239
- # Calculate time distribution
240
- distribution = {
241
- 'total_time': topic_msgs['time_diff'].sum().total_seconds() / 60,
242
- 'avg_time_per_message': topic_msgs['time_diff'].mean().total_seconds() / 60,
243
- 'max_time_gap': topic_msgs['time_diff'].max().total_seconds() / 60,
244
- 'time_spent_on_questions': topic_msgs[topic_msgs['is_question']]['time_diff'].sum().total_seconds() / 60,
245
- 'time_spent_on_confusion': topic_msgs[topic_msgs['shows_confusion']]['time_diff'].sum().total_seconds() / 60
246
- }
247
-
248
- return distribution
249
-
250
- def calculate_engagement_metrics(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
251
- """Calculate student engagement metrics for the topic."""
252
- topic_msgs = df[df['prompt'].str.contains(topic, case=False)]
253
-
254
- metrics = {
255
- 'message_count': len(topic_msgs),
256
- 'question_ratio': topic_msgs['is_question'].mean(),
257
- 'confusion_ratio': topic_msgs['shows_confusion'].mean(),
258
- 'follow_up_ratio': topic_msgs['is_followup'].mean(),
259
- 'avg_sentiment': topic_msgs['sentiment'].mean(),
260
- 'engagement_score': 0.0 # Will be calculated below
261
- }
262
-
263
- # Calculate engagement score
264
- metrics['engagement_score'] = (
265
- metrics['message_count'] * 0.3 +
266
- metrics['question_ratio'] * 0.25 +
267
- metrics['follow_up_ratio'] * 0.25 +
268
- (metrics['avg_sentiment'] + 1) / 2 * 0.2 # Normalize sentiment to 0-1
269
- )
270
-
271
- return metrics
272
-
273
- def generate_topic_insights(self, df: pd.DataFrame, topic: str) -> TopicInsights:
274
- """Generate comprehensive insights for a topic."""
275
- difficulty = self.analyze_topic_difficulty(df, topic)
276
- confusion_points = self.identify_confusion_patterns(df, topic)
277
- question_patterns = self.analyze_question_patterns(df, topic)
278
- time_distribution = self.analyze_time_distribution(df, topic)
279
- engagement_metrics = self.calculate_engagement_metrics(df, topic)
280
-
281
- # Generate recommended focus areas based on insights
282
- focus_areas = []
283
-
284
- if difficulty in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT):
285
- focus_areas.append("Fundamental concept reinforcement needed")
286
-
287
- if confusion_points:
288
- focus_areas.append(f"Address common confusion around: {', '.join(confusion_points[:3])}")
289
-
290
- if engagement_metrics['confusion_ratio'] > 0.3:
291
- focus_areas.append("Consider alternative teaching approaches")
292
-
293
- if time_distribution.get('time_spent_on_questions', 0) > time_distribution.get('total_time', 0) * 0.5:
294
- focus_areas.append("More practical examples or demonstrations needed")
295
-
296
- return TopicInsights(
297
- difficulty_level=difficulty,
298
- common_confusion_points=confusion_points,
299
- question_patterns=question_patterns,
300
- time_distribution=time_distribution,
301
- engagement_metrics=engagement_metrics,
302
- recommended_focus_areas=focus_areas
303
- )
304
-
305
- def analyze_student_progress(self, df: pd.DataFrame) -> Dict[str, Any]:
306
- """Analyze individual student progress and learning patterns."""
307
- student_progress = {}
308
-
309
- for student_id in df['user_id'].unique():
310
- student_msgs = df[df['user_id'] == student_id]
311
-
312
- # Calculate student-specific metrics
313
- progress = {
314
- 'total_messages': len(student_msgs),
315
- 'questions_asked': student_msgs['is_question'].sum(),
316
- 'confusion_instances': student_msgs['shows_confusion'].sum(),
317
- 'avg_sentiment': student_msgs['sentiment'].mean(),
318
- 'topic_engagement': {},
319
- 'learning_pattern': self._identify_learning_pattern(student_msgs)
320
- }
321
-
322
- # Analyze topic-specific engagement
323
- topics = self.extract_topic_hierarchies(student_msgs)
324
- for topic in topics:
325
- topic_msgs = student_msgs[student_msgs['prompt'].str.contains(topic, case=False)]
326
- progress['topic_engagement'][topic] = {
327
- 'message_count': len(topic_msgs),
328
- 'confusion_rate': topic_msgs['shows_confusion'].mean(),
329
- 'sentiment_trend': stats.linregress(
330
- range(len(topic_msgs)),
331
- topic_msgs['sentiment']
332
- ).slope
333
- }
334
-
335
- student_progress[student_id] = progress
336
-
337
- return student_progress
338
-
339
- def _identify_learning_pattern(self, student_msgs: pd.DataFrame) -> str:
340
- """Identify student's learning pattern based on their interaction style."""
341
- # Calculate key metrics
342
- question_ratio = student_msgs['is_question'].mean()
343
- confusion_ratio = student_msgs['shows_confusion'].mean()
344
- follow_up_ratio = student_msgs['is_followup'].mean()
345
- sentiment_trend = stats.linregress(
346
- range(len(student_msgs)),
347
- student_msgs['sentiment']
348
- ).slope
349
-
350
- # Identify pattern
351
- if question_ratio > 0.6:
352
- return "Inquisitive Learner"
353
- elif confusion_ratio > 0.4:
354
- return "Needs Additional Support"
355
- elif follow_up_ratio > 0.5:
356
- return "Deep Dive Learner"
357
- elif sentiment_trend > 0:
358
- return "Progressive Learner"
359
- else:
360
- return "Steady Learner"
361
-
362
- def generate_comprehensive_report(self, chat_history: List[Dict]) -> Dict[str, Any]:
363
- """Generate a comprehensive analytics report."""
364
- # Preprocess chat history
365
- df = self.preprocess_chat_history(chat_history)
366
-
367
- # Extract topics
368
- topics = self.extract_topic_hierarchies(df)
369
-
370
- report = {
371
- 'topics': {},
372
- 'student_progress': self.analyze_student_progress(df),
373
- 'overall_metrics': {
374
- 'total_conversations': len(df),
375
- 'unique_students': df['user_id'].nunique(),
376
- 'avg_sentiment': df['sentiment'].mean(),
377
- 'most_discussed_topics': Counter(
378
- topic for topics_list in topics.values()
379
- for topic in topics_list
380
- ).most_common(5)
381
- }
382
- }
383
-
384
- # Generate topic-specific insights
385
- for main_topic, subtopics in topics.items():
386
- subtopic_insights = {}
387
- for subtopic in subtopics:
388
- subtopic_insights[subtopic] = {
389
- 'insights': self.generate_topic_insights(df, subtopic),
390
- 'related_topics': [t for t in subtopics if t != subtopic],
391
- 'student_engagement': {
392
- student_id: self.calculate_engagement_metrics(
393
- df[df['user_id'] == student_id],
394
- subtopic
395
- )
396
- for student_id in df['user_id'].unique()
397
- }
398
- }
399
-
400
- report['topics'][main_topic] = {
401
- 'insights': self.generate_topic_insights(df, main_topic),
402
- 'subtopics': subtopic_insights,
403
- 'topic_relationships': {
404
- 'hierarchy_depth': len(subtopics),
405
- 'connection_strength': self._calculate_topic_connections(df, main_topic, subtopics),
406
- 'progression_path': self._identify_topic_progression(df, main_topic, subtopics)
407
- }
408
- }
409
-
410
- # Add temporal analysis
411
- report['temporal_analysis'] = {
412
- 'daily_engagement': df.groupby(df['timestamp'].dt.date).agg({
413
- 'user_id': 'count',
414
- 'is_question': 'sum',
415
- 'shows_confusion': 'sum',
416
- 'sentiment': 'mean'
417
- }).to_dict(),
418
- 'peak_activity_hours': df.groupby(df['timestamp'].dt.hour)['user_id'].count().nlargest(3).to_dict(),
419
- 'learning_trends': self._analyze_learning_trends(df)
420
- }
421
-
422
- # Add recommendations
423
- report['recommendations'] = self._generate_recommendations(report)
424
-
425
- return report
426
-
427
- def _calculate_topic_connections(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> Dict[str, float]:
428
- """Calculate connection strength between topics based on co-occurrence."""
429
- connections = {}
430
- main_topic_msgs = df[df['prompt'].str.contains(main_topic, case=False)]
431
-
432
- for subtopic in subtopics:
433
- cooccurrence = df[
434
- df['prompt'].str.contains(main_topic, case=False) &
435
- df['prompt'].str.contains(subtopic, case=False)
436
- ].shape[0]
437
-
438
- connection_strength = cooccurrence / len(main_topic_msgs) if len(main_topic_msgs) > 0 else 0
439
- connections[subtopic] = connection_strength
440
-
441
- return connections
442
-
443
- def _identify_topic_progression(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> List[str]:
444
- """Identify optimal topic progression path based on student interactions."""
445
- topic_difficulties = {}
446
-
447
- for subtopic in subtopics:
448
- difficulty = self.analyze_topic_difficulty(df, subtopic)
449
- topic_difficulties[subtopic] = difficulty.value
450
-
451
- # Sort subtopics by difficulty
452
- return sorted(subtopics, key=lambda x: topic_difficulties[x])
453
-
454
- def _analyze_learning_trends(self, df: pd.DataFrame) -> Dict[str, Any]:
455
- """Analyze overall learning trends across the dataset."""
456
- return {
457
- 'sentiment_trend': stats.linregress(
458
- range(len(df)),
459
- df['sentiment']
460
- )._asdict(),
461
- 'confusion_trend': stats.linregress(
462
- range(len(df)),
463
- df['shows_confusion']
464
- )._asdict(),
465
- 'engagement_progression': self._calculate_engagement_progression(df)
466
- }
467
-
468
- def _calculate_engagement_progression(self, df: pd.DataFrame) -> Dict[str, float]:
469
- """Calculate how student engagement changes over time."""
470
- df['week'] = df['timestamp'].dt.isocalendar().week
471
- weekly_engagement = df.groupby('week').agg({
472
- 'is_question': 'mean',
473
- 'shows_confusion': 'mean',
474
- 'is_followup': 'mean',
475
- 'sentiment': 'mean'
476
- })
477
-
478
- return {
479
- 'question_trend': stats.linregress(
480
- range(len(weekly_engagement)),
481
- weekly_engagement['is_question']
482
- ).slope,
483
- 'confusion_trend': stats.linregress(
484
- range(len(weekly_engagement)),
485
- weekly_engagement['shows_confusion']
486
- ).slope,
487
- 'follow_up_trend': stats.linregress(
488
- range(len(weekly_engagement)),
489
- weekly_engagement['is_followup']
490
- ).slope,
491
- 'sentiment_trend': stats.linregress(
492
- range(len(weekly_engagement)),
493
- weekly_engagement['sentiment']
494
- ).slope
495
- }
496
-
497
- def _generate_recommendations(self, report: Dict[str, Any]) -> List[str]:
498
- """Generate actionable recommendations based on the analysis."""
499
- recommendations = []
500
-
501
- # Analyze difficulty distribution
502
- difficult_topics = [
503
- topic for topic, data in report['topics'].items()
504
- if data['insights'].difficulty_level in
505
- (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT)
506
- ]
507
-
508
- if difficult_topics:
509
- recommendations.append(
510
- f"Consider providing additional resources for challenging topics: {', '.join(difficult_topics)}"
511
- )
512
-
513
- # Analyze student engagement
514
- avg_engagement = np.mean([
515
- progress['questions_asked'] / progress['total_messages']
516
- for progress in report['student_progress'].values()
517
- ])
518
-
519
- if avg_engagement < 0.3:
520
- recommendations.append(
521
- "Implement more interactive elements to increase student engagement"
522
- )
523
-
524
- # Analyze temporal patterns
525
- peak_hours = list(report['temporal_analysis']['peak_activity_hours'].keys())
526
- recommendations.append(
527
- f"Consider scheduling additional support during peak activity hours: {peak_hours}"
528
- )
529
-
530
- # Analyze learning trends
531
- # sentiment_trend = report['temporal_analysis']['learning_trends']['sentiment_trend']
532
- # if sentiment_trend < 0:
533
- # recommendations.append(
534
- # "Review teaching approach to address declining student satisfaction"
535
- # )
536
- # Analyze learning trends
537
- # Analyze learning trends
538
- sentiment_trend = report.get('temporal_analysis', {}).get('learning_trends', {}).get('sentiment_trend', None)
539
- if isinstance(sentiment_trend, (int, float)):
540
- if sentiment_trend < 0:
541
- recommendations.append(
542
- "Review teaching approach to address declining student satisfaction"
543
- )
544
- elif isinstance(sentiment_trend, dict):
545
- # Handle the case where sentiment_trend is a dictionary
546
- print(f"Unexpected dict format for sentiment_trend: {sentiment_trend}")
547
- else:
548
- print(f"Unexpected type for sentiment_trend: {type(sentiment_trend)}")
549
-
550
- return recommendations
551
-
552
- class CustomJSONEncoder(json.JSONEncoder):
553
- def default(self, obj):
554
- if isinstance(obj, TopicDifficulty):
555
- return obj.value
556
- if isinstance(obj, TopicInsights):
557
- return obj.to_dict()
558
- if isinstance(obj, np.integer):
559
- return int(obj)
560
- if isinstance(obj, np.floating):
561
- return float(obj)
562
- if isinstance(obj, np.ndarray):
563
- return obj.tolist()
564
- if isinstance(obj, datetime):
565
- return obj.isoformat()
566
- return super().default(obj)
567
-
568
- def convert_insights_to_dict(report):
569
- for main_topic, data in report['topics'].items():
570
- if isinstance(data['insights'], TopicInsights):
571
- data['insights'] = data['insights'].to_dict()
572
- for subtopic, subdata in data['subtopics'].items():
573
- if isinstance(subdata['insights'], TopicInsights):
574
- subdata['insights'] = subdata['insights'].to_dict()
575
-
576
- if __name__ == "__main__":
577
- # Load chat history data
578
- chat_history = None
579
- with open('sample_files/chat_history_corpus.json', 'r', encoding="utf-8") as file:
580
- chat_history = json.load(file)
581
-
582
- # Initialize analytics system
583
- analytics = PreClassAnalytics()
584
-
585
- # Generate comprehensive report
586
- report = analytics.generate_comprehensive_report(chat_history)
587
-
588
- # Convert insights to dictionary
589
- # convert_insights_to_dict(report)
590
-
591
- print(json.dumps(report, indent=4, cls=CustomJSONEncoder))
592
- # print(report)
session_page.py CHANGED

@@ -34,6 +34,7 @@ from bs4 import BeautifulSoup
 import streamlit.components.v1 as components
 from live_chat_feature import display_live_chat_interface
 from code_playground import display_code_playground
+from urllib.parse import urlparse, parse_qs
 
 # Load environment variables
 load_dotenv()
@@ -550,36 +551,107 @@ def display_preclass_content(session, student_id, course_id):
 
 import requests
 
+def get_supported_url_formats():
+    """Return a list of supported URL formats for faculty reference"""
+    return """
+    Supported YouTube URL formats:
+    1. Standard watch URL: https://www.youtube.com/watch?v=VIDEO_ID
+    2. Short URL: https://youtu.be/VIDEO_ID
+    3. Embed URL: https://www.youtube.com/embed/VIDEO_ID
+    4. Mobile URL: https://m.youtube.com/watch?v=VIDEO_ID
+    5. YouTube Shorts: https://www.youtube.com/shorts/VIDEO_ID
+
+    You can copy any of these formats from:
+    - YouTube website (Share button)
+    - YouTube mobile app (Share button)
+    - Browser address bar while watching the video
+    """
+
+
+def display_url_guidance():
+    """Display guidance for faculty on how to get the correct URL"""
+    st.info("""
+    📝 How to get the correct YouTube URL:
+    1. Go to the YouTube video you want to share
+    2. Click the 'Share' button below the video
+    3. Copy the URL provided in the share dialog
+    4. Paste it here
+
+    The URL should start with either 'youtube.com' or 'youtu.be'
+    """)
 def fetch_youtube_video_title(video_url):
-    """Fetch the title of a YouTube video using the YouTube Data API"""
+    """
+    Fetch the title of a YouTube video with detailed error handling
+    """
     api_key = os.getenv("YOUTUBE_API_KEY")
+    if not api_key:
+        st.error("⚠️ System Configuration Error: YouTube API key not configured.")
+        st.write("Please contact technical support for assistance.")
+        return None
+
     video_id = extract_youtube_id(video_url)
     if not video_id:
        return None
 
     url = f"https://www.googleapis.com/youtube/v3/videos?id={video_id}&key={api_key}&part=snippet"
-    response = requests.get(url)
-    if response.status_code == 200:
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+
         data = response.json()
-        if "items" in data and len(data["items"]) > 0:
-            return data["items"][0]["snippet"]["title"]
-    return None
+        if not data.get("items"):
+            st.error("⚠️ Video not found or might be private.")
+            st.write("""
+            Please check if:
+            1. The video is publicly available
+            2. The URL is correct
+            3. The video hasn't been deleted
+            """)
+            return None
+
+        return data["items"][0]["snippet"]["title"]
+
+    except requests.exceptions.RequestException as e:
+        if "quotaExceeded" in str(e):
+            st.error("⚠️ YouTube API quota exceeded.")
+            st.write("""
+            The system has reached its daily limit for video processing.
+            Please try:
+            1. Waiting a few hours
+            2. Trying again tomorrow
+            3. Contact support if the issue persists
+            """)
+        else:
+            st.error(f"Error fetching video title: {str(e)}")
+            st.write("Please try again or choose a different video.")
+        return None
 
 def upload_video_source(course_id, session_id, video_url):
-    """Upload video source and its transcript to the database"""
+    """
+    Upload video source and its transcript with comprehensive error handling
+    """
+    if not video_url:
+        st.error("Please provide a YouTube URL.")
+        display_url_guidance()
+        return None
+
+    # Display processing message
+    # with st.spinner("Processing your YouTube video..."):
+    # Validate video URL
+    video_id = extract_youtube_id(video_url)
+    if not video_id:
+        return None
+
     # Fetch video title
     video_title = fetch_youtube_video_title(video_url)
     if not video_title:
-        st.error("Could not fetch the video title from the provided YouTube URL.")
-        return
-    # print("Video Title: ", video_title)
-    # Extract transcript from YouTube video
+        return None
+
+    # Extract transcript
     transcript = extract_youtube_transcript(video_url)
-
     if not transcript:
-        st.error("Could not extract transcript from the provided YouTube URL.")
-        return
-
+        return None
+
     # Create resource document
     resource_data = {
         "_id": ObjectId(),
@@ -590,37 +662,89 @@ def upload_video_source(course_id, session_id, video_url):
         "text_content": transcript,
         "material_type": "video",
         "source_url": video_url,
-        "uploaded_at": datetime.utcnow()
-    }
+        "uploaded_at": datetime.utcnow(),
+        "video_id": video_id
+    }
+
     # Check if resource already exists
     existing_resource = resources_collection.find_one({
         "session_id": session_id,
-        "source_url": video_url
+        "video_id": video_id
     })
 
     if existing_resource:
-        st.warning("This video resource already exists.")
+        st.warning("⚠️ This video has already been added to this session.")
+        st.write("""
+        Options:
+        1. Choose a different video
+        2. Use the existing video resource
+        3. Remove the existing video first if you want to re-add it
+        """)
         return existing_resource["_id"]
 
-    # Insert new resource
-    resources_collection.insert_one(resource_data)
-    resource_id = resource_data["_id"]
-
-    # Update course document
-    courses_collection.update_one(
-        {
-            "course_id": course_id,
-            "sessions.session_id": session_id
-        },
-        {
-            "$push": {"sessions.$.pre_class.resources": resource_id}
-        }
-    )
-    # Create vector store for the transcript
-    create_vector_store(transcript, resource_id)
-
-    # st.success("Video source uploaded successfully!")
-    return resource_id
+    try:
+        # Insert new resource
+        result = resources_collection.insert_one(resource_data)
+        resource_id = result.inserted_id
+
+        # Update course document
+        update_result = courses_collection.update_one(
+            {
+                "course_id": course_id,
+                "sessions.session_id": session_id
+            },
+            {
+                "$push": {"sessions.$.pre_class.resources": resource_id}
+            }
+        )
+
+        if update_result.modified_count == 0:
+            st.error("⚠️ Failed to update course with new resource.")
+            st.write("""
+            The video was processed but couldn't be added to the course.
+            This might be because:
+            1. The course or session ID is invalid
+            2. You don't have permission to modify this course
+            3. There was a system error
+
+            Please try again or contact support if the issue persists.
+            """)
+            # Rollback resource insertion
+            resources_collection.delete_one({"_id": resource_id})
+            return None
+
+        # Create vector store for the transcript
+        # create_vector_store(transcript, resource_id)
+        vector_store_result = create_vector_store(transcript, resource_id)
+        if not vector_store_result:
+            st.error("⚠️ Failed to create vector store for the transcript.")
+            # Rollback insertions
+            resources_collection.delete_one({"_id": resource_id})
+            return None
+
+        st.success("✅ Video successfully added to your course!")
+        st.write(f"""
+        Added: "{video_title}"
+        You can now:
+        1. Add more videos
+        2. Preview the added video
+        3. Continue building your course
+        """)
+        return resource_id
+
+    except Exception as e:
+        st.error("⚠️ Error uploading video source.")
+        st.write(f"""
+        There was an error while saving the video:
+        {str(e)}
+
+        Please:
+        1. Try again
+        2. Choose a different video
+        3. Contact support if the issue persists
+        """)
+        return None
 
 def upload_preclass_materials(session_id, course_id):
     """Upload pre-class materials and manage external resources for a session"""
@@ -648,8 +772,8 @@ def upload_preclass_materials(session_id, course_id):
     if st.button("Upload Video"):
         with st.spinner("Processing video source..."):
             video_resource_id = upload_video_source(course_id, session_id, video_url)
-            if video_resource_id:
-                st.success("Video source uploaded successfully!")
+            # if video_resource_id:
+            #     st.success("Video source uploaded successfully!")
 
     with external_tab:
         # Fetch and display external resources
@@ -717,7 +841,10 @@ def upload_preclass_materials(session_id, course_id):
     for material_type, resources in grouped_resources.items():
         st.markdown(f"##### {material_type.capitalize()} Resources")
         for material in resources:
-            st.markdown(f"- **{material['file_name']}** ({material['file_type']})")
+            resource_info = f"- **{material['file_name']}** ({material['file_type']})"
+            if 'source_url' in material:
+                resource_info += f" - [URL]({material['source_url']})"
+            st.markdown(resource_info)
 
 def extract_external_content(url, content_type):
     """Extract content from external resources based on their type"""
@@ -731,18 +858,57 @@ def extract_external_content(url, content_type):
     return None
 
 def extract_youtube_transcript(url):
-    """Extract transcript from YouTube videos"""
+    """
+    Extract transcript from YouTube videos with detailed error handling
+    """
     try:
-        # Extract video ID from URL
-        video_id = url.split('v=')[1].split('&')[0]
-
-        # Get transcript
-        transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        # Combine transcript text
-        full_text = ' '.join([entry['text'] for entry in transcript])
-        return full_text
+        video_id = extract_youtube_id(url)
+        if not video_id:
+            return None
+
+        # Get transcript with retries
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                transcript = YouTubeTranscriptApi.get_transcript(video_id)
+                # Combine transcript text with proper spacing and punctuation
+                full_text = ''
+                for entry in transcript:
+                    text = entry['text'].strip()
+                    if text:
+                        if not full_text.endswith(('.', '!', '?', '..."')):
+                            full_text += '. '
+                        full_text += text + ' '
+                return full_text.strip()
+            except Exception as e:
+                if attempt == max_retries - 1:
+                    raise e
+                continue
+
     except Exception as e:
-        st.error(f"Could not extract YouTube transcript: {str(e)}")
+        error_message = str(e)
+        if "Video unavailable" in error_message:
+            st.error("⚠️ This video is unavailable or private. Please check if:")
+            st.write("""
+            - The video is set to public or unlisted
+            - The video hasn't been deleted
+            - You have the correct URL
+            """)
+        elif "Subtitles are disabled" in error_message:
+            st.error("⚠️ This video doesn't have subtitles/transcript available.")
+            st.write("""
+            Unfortunately, this video cannot be used because:
+            - It doesn't have closed captions or subtitles
+            - The creator hasn't enabled transcript generation
+
+            Please choose another video that has subtitles available.
+            You can check if a video has subtitles by:
+            1. Playing the video on YouTube
+            2. Clicking the 'CC' button in the video player
+            """)
+        else:
+            st.error(f"Could not extract YouTube transcript: {error_message}")
+            st.write("Please try again or choose a different video.")
         return None
 
 def extract_web_article(url):
@@ -813,17 +979,54 @@ def upload_external_resource(course_id, session_id, title, content, content_type
     return resource_id
 
 def extract_youtube_id(url):
-    """Extract YouTube video ID from URL"""
-    if 'youtube.com' in url:
-        try:
-            return url.split('v=')[1].split('&')[0]
-        except IndexError:
-            return None
-    elif 'youtu.be' in url:
-        try:
-            return url.split('/')[-1]
-        except IndexError:
-            return None
+    """
+    Extract YouTube video ID from various URL formats
+    """
+    if not url:
+        st.error("Please provide a YouTube URL.")
+        display_url_guidance()
+        return None
+
+    # Clean the URL
+    url = url.strip()
+
+    # Basic URL validation
+    if not ('youtube.com' in url or 'youtu.be' in url):
+        st.error("This doesn't appear to be a YouTube URL.")
+        st.write(get_supported_url_formats())
+        return None
+
+    # Try to extract using regex patterns
+    patterns = [
+        r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/e\/|youtube\.com\/shorts\/)([^&\n?#]+)',
+        r'(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})'
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            video_id = match.group(1)
+            if len(video_id) != 11:  # YouTube IDs are always 11 characters
+                st.error("Invalid YouTube video ID length. Please check your URL.")
+                display_url_guidance()
+                return None
+            return video_id
+
+    # If regex fails, try parsing URL components
+    try:
+        parsed_url = urlparse(url)
+        if 'youtube.com' in parsed_url.netloc:
+            query_params = parse_qs(parsed_url.query)
+            if 'v' in query_params:
+                return query_params['v'][0]
+        elif 'youtu.be' in parsed_url.netloc:
+            return parsed_url.path.lstrip('/')
+    except Exception:
+        pass
+
+    # If all extraction methods fail
+    st.error("Could not extract video ID from the provided URL.")
+    st.write(get_supported_url_formats())
     return None
 
 def display_live_presentation(session, user_type, course_id):
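Note on the URL handling change: the reworked extract_youtube_id tries regex patterns covering watch, short, embed, mobile, and Shorts links, then falls back to urlparse/parse_qs, instead of splitting the string on 'v='. The sketch below exercises the same extraction idea outside Streamlit; the helper name, the simplified single pattern, and the sample URLs are illustrative only, not code from this repository.

import re
from typing import Optional
from urllib.parse import urlparse, parse_qs

def extract_video_id(url: str) -> Optional[str]:
    """Pull the 11-character YouTube video ID out of common share-link formats."""
    url = url.strip()
    # One pattern covering watch, embed, Shorts, /v/ and youtu.be links.
    match = re.search(
        r'(?:youtube\.com/(?:watch\?v=|embed/|shorts/|v/)|youtu\.be/)([A-Za-z0-9_-]{11})',
        url,
    )
    if match:
        return match.group(1)
    # Fallback: parse the query string of a youtube.com URL.
    parsed = urlparse(url)
    if 'youtube.com' in parsed.netloc:
        return parse_qs(parsed.query).get('v', [None])[0]
    if 'youtu.be' in parsed.netloc:
        return parsed.path.lstrip('/') or None
    return None

if __name__ == "__main__":
    for sample in (
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://youtu.be/dQw4w9WgXcQ",
        "https://www.youtube.com/shorts/dQw4w9WgXcQ",
        "https://m.youtube.com/watch?v=dQw4w9WgXcQ",
    ):
        print(sample, "->", extract_video_id(sample))

Every variant resolves to the same 11-character ID, which is what lets the reworked upload_video_source de-duplicate resources on video_id rather than on the raw URL string.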