Harshal Vhatkar committed
Commit 7dd7d88 · 1 Parent(s): e7247bf

improve video upload

Files changed:
- file_upload_vectorize.py +22 -8
- pre_class_analytics3.py +0 -499
- pre_class_analytics4.py +0 -592
- session_page.py +265 -62
file_upload_vectorize.py  CHANGED

@@ -141,12 +141,18 @@ def extract_text_from_file(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
 
+from sentence_transformers import SentenceTransformer
+
 def get_embedding(text):
-
-
-
-
-
+    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    embeddings = model.encode(text)
+    # response = openai.embeddings.create(
+    #     model="text-embedding-ada-002",
+    #     input=text
+    # )
+    # return response.data[0].embedding
+    return embeddings
+
 
 def create_vector_store(text, resource_id):
     # resource_object_id = ObjectId(resource_id)

@@ -169,11 +175,19 @@ def create_vector_store(text, resource_id):
 
     vector_data = {
         "resource_id": resource_id,
-        "vector": embedding,
+        "vector": embedding.tolist(),
         "text": text,
         "created_at": datetime.utcnow()
     }
 
-    vectors_collection.insert_one(vector_data)
-    #
+    # vectors_collection.insert_one(vector_data)
+    # Store in MongoDB
+    try:
+        vectors_collection.insert_one(vector_data)
+    except Exception as db_error:
+        st.error(f"Database error: {str(db_error)}")
+        return None
+
+    # return VectorStoreIndex.from_documents([document])
+    return vector_data
pre_class_analytics3.py  DELETED

@@ -1,499 +0,0 @@
from datetime import datetime
import json
from bson import ObjectId
import typing_extensions as typing
import google.generativeai as genai
from typing import List, Dict, Any
import numpy as np
from collections import defaultdict

from dotenv import load_dotenv
import os
import pymongo
from pymongo import MongoClient

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_KEY")

class NovaScholarAnalytics:
    def __init__(self, model_name: str = "gemini-1.5-flash"):
        genai.configure(api_key=GEMINI_API_KEY)
        self.model = genai.GenerativeModel(model_name)

    def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
        # Code 2:
        """Preprocess chat histories to focus on relevant information."""
        processed = []

        for chat in chat_histories:
            # Convert ObjectId to string if it's an ObjectId
            user_id = str(chat["user_id"]["$oid"]) if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"] else str(chat["user_id"])

            try:
                processed_chat = {
                    "user_id": user_id,
                    "messages": [
                        {
                            "prompt": msg["prompt"],
                            "response": msg["response"]
                        }
                        for msg in chat["messages"]
                    ]
                }
                processed.append(processed_chat)
                print(f"Successfully processed chat for user: {user_id}")
            except Exception as e:
                print(f"Error processing chat for user: {user_id}")
                print(f"Error details: {str(e)}")
                continue

        return processed

    def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str:
        """Creates a structured prompt for Gemini to analyze chat histories."""
        return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE.

Context:
- Chat histories: {json.dumps(chat_histories, indent=2)}
- These are pre-class interactions between students and an AI tutor
- Topics covered: {', '.join(all_topics)}

Your task is to provide analytics with supporting evidence from the chat histories.

Output Format (strictly follow this JSON structure):
{{
    "topic_wise_insights": [
        {{
            "topic": "<string>",
            "struggling_percentage": <number between 0 and 1>,
            "evidence": {{
                "calculation": "Explain how struggling_percentage was calculated",
                "supporting_messages": [
                    {{
                        "user_id": "<string>",
                        "message": "<string>",
                        "reasoning": "Why this message indicates struggling"
                    }}
                ]
            }},
            "key_issues": ["<string>"],
            "key_misconceptions": ["<string>"],
            "evidence_for_issues": [
                {{
                    "issue": "<string>",
                    "supporting_messages": [
                        {{
                            "user_id": "<string>",
                            "message": "<string>"
                        }}
                    ]
                }}
            ]
        }}
    ],
    "ai_recommended_actions": [
        {{
            "action": "<string>",
            "priority": "high|medium|low",
            "reasoning": "<string>",
            "evidence": {{
                "supporting_messages": [
                    {{
                        "user_id": "<string>",
                        "message": "<string>",
                        "relevance": "Why this message supports the recommendation"
                    }}
                ],
                "pattern_description": "Description of the pattern observed in chat histories"
            }},
            "expected_outcome": "<string>"
        }}
    ],
    "student_analytics": [
        {{
            "student_id": "<string>",
            "engagement_metrics": {{
                "participation_level": <number between 0 and 1>,
                "concept_understanding": "strong|moderate|needs_improvement",
                "question_quality": "advanced|intermediate|basic"
            }},
            "evidence": {{
                "participation_calculation": "Explain how participation_level was calculated",
                "understanding_evidence": [
                    {{
                        "message": "<string>",
                        "analysis": "Why this indicates their understanding level"
                    }}
                ],
                "question_quality_evidence": [
                    {{
                        "question": "<string>",
                        "analysis": "Why this question is classified at this level"
                    }}
                ]
            }},
            "struggling_topics": ["<string>"],
            "personalized_recommendation": "<string>"
        }}
    ]
}}

Guidelines for Analysis:
1. For every insight, recommendation, or metric, provide specific evidence from the chat histories
2. Explain calculations (e.g., how struggling_percentage was derived)
3. Include relevant message excerpts that support each conclusion
4. For recommendations, show the pattern of student interactions that led to that recommendation
5. When analyzing question quality or understanding, provide reasoning for the classification

The response must adhere strictly to the above JSON structure, with all fields populated appropriately."""

    def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
        """Validate the initial analytics by checking evidence."""
        validation_prompt = f"""Review and validate the following analytics based on the provided evidence.

Analytics to validate: {json.dumps(initial_analytics, indent=2)}

For each section:
1. Verify if the evidence supports the conclusions
2. Check if calculations (percentages, metrics) are justified by the data
3. Validate if recommendations are supported by patterns in the chat history

Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
For any removed items, include them in a separate "insufficient_evidence" section with explanation."""

        try:
            validation_response = self.model.generate_content(
                validation_prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.1
                )
            )

            validated_analytics = json.loads(validation_response.text)
            return validated_analytics

        except Exception as e:
            print(f"Error in validation: {str(e)}")
            return initial_analytics

    def _enrich_analytics(self, analytics: Dict) -> Dict:
        """Add derived insights and metrics to the validated analytics."""
        try:
            # Calculate class distribution
            total_students = len(analytics.get("student_insights", []))
            performance_distribution = defaultdict(int)

            for student in analytics.get("student_insights", []):
                metrics = student.get("engagement_metrics", {})
                understanding = metrics.get("concept_understanding", "moderate")

                if understanding == "strong":
                    performance_distribution["high_performers"] += 1
                elif understanding == "needs_improvement":
                    performance_distribution["at_risk"] += 1
                else:
                    performance_distribution["average_performers"] += 1

            # Convert to percentages
            class_distribution = {
                level: count/total_students if total_students > 0 else 0
                for level, count in performance_distribution.items()
            }

            # Calculate overall engagement
            engagement_sum = sum(
                student.get("engagement_metrics", {}).get("participation_level", 0)
                for student in analytics.get("student_insights", [])
            )
            overall_engagement = engagement_sum / total_students if total_students > 0 else 0

            # Identify critical topics (those with high struggling percentage)
            critical_topics = [
                topic["topic"]
                for topic in analytics.get("topic_wise_insights", [])
                if topic.get("struggling_percentage", 0) > 0.7  # 70% threshold
            ]

            # Identify students needing intervention
            immediate_attention = []
            monitoring_required = []

            for student in analytics.get("student_insights", []):
                student_id = student.get("student_id")
                metrics = student.get("engagement_metrics", {})

                # Check for immediate attention needed
                if (metrics.get("concept_understanding") == "needs_improvement" or
                    metrics.get("participation_level", 0) < 0.3 or  # Less than 30% participation
                    len(student.get("struggling_topics", [])) > 2):  # Struggling with more than 2 topics
                    immediate_attention.append(student_id)
                # Check for monitoring
                elif (metrics.get("concept_understanding") == "moderate" or
                      metrics.get("participation_level", 0) < 0.5):  # Less than 50% participation
                    monitoring_required.append(student_id)

            # Add enriched data to analytics
            analytics["course_health"] = {
                "overall_engagement": overall_engagement,
                "critical_topics": critical_topics,
                "class_distribution": class_distribution
            }

            analytics["intervention_metrics"] = {
                "immediate_attention_needed": immediate_attention,
                "monitoring_required": monitoring_required
            }

            # Add evidence for enriched metrics
            analytics["course_health"]["evidence"] = {
                "engagement_calculation": f"Calculated from average participation level of {total_students} students",
                "critical_topics_criteria": "Topics where over 70% of students are struggling",
                "distribution_calculation": "Based on concept understanding levels from student metrics"
            }

            analytics["intervention_metrics"]["evidence"] = {
                "immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
                "monitoring_criteria": "Students with moderate understanding or participation < 50%"
            }

            return analytics

        except Exception as e:
            print(f"Error enriching analytics: {str(e)}")
            return analytics  # Return original analytics if enrichment fails

    def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
        """Main method to generate analytics with evidence-based validation."""
        try:
            if not chat_histories or not all_topics:
                print("Missing required input data")
                return self._fallback_analytics()

            try:
                processed_histories = self._preprocess_chat_histories(chat_histories)
                print("Successfully preprocessed chat histories")
            except Exception as preprocess_error:
                print(f"Error in preprocessing: {str(preprocess_error)}")
                return self._fallback_analytics()

            try:
                prompt = self._create_analytics_prompt(processed_histories, all_topics)
                print("Successfully created prompt")
                print("Prompt preview:", prompt[:200] + "...")  # Print first 200 chars
            except Exception as prompt_error:
                print(f"Error in prompt creation: {str(prompt_error)}")
                return self._fallback_analytics()

            # Generate initial analytics with evidence
            # prompt = self._create_analytics_prompt(chat_histories, all_topics)
            response = self.model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.15
                )
            )
            print(response.text)

            if not response.text:
                print("Empty response from Gemini")
                return self._fallback_analytics()

            # Parse initial analytics
            # initial_analytics = self._process_gemini_response(response.text)
            initial_analytics2 = json.loads(response.text)
            print("Initial analytics:", initial_analytics2)
            # print("Initial analytics type:", type(initial_analytics2))
            # print("Moving to validation...")

            # Validate analytics using evidence
            validated_analytics = self._validate_analytics_with_evidence(initial_analytics2)

            # Enrich with additional metrics
            final_analytics = self._enrich_analytics(validated_analytics)

            return final_analytics

        except Exception as e:
            print(f"Error generating analytics: {str(e)}")
            return self._fallback_analytics()

    def _fallback_analytics(self) -> Dict:
        """Provide fallback analytics with explanation."""
        return {
            "topic_insights": [],
            "student_insights": [],
            "recommended_actions": [
                {
                    "action": "Review analytics generation process",
                    "priority": "high",
                    "target_group": "system_administrators",
                    "reasoning": "Analytics generation failed",
                    "expected_impact": "Restore analytics functionality",
                    "evidence": {
                        "error": "Analytics generation failed to complete"
                    }
                }
            ],
            "course_health": {
                "overall_engagement": 0,
                "critical_topics": [],
                "class_distribution": {
                    "high_performers": 0,
                    "average_performers": 0,
                    "at_risk": 0
                }
            },
            "intervention_metrics": {
                "immediate_attention_needed": [],
                "monitoring_required": []
            }
        }

    def _process_gemini_response(self, response: str) -> Dict:
        print("Entered here")
        try:
            analytics = json.loads(response, object_hook=json_serializer)
            if not isinstance(analytics, dict):
                raise ValueError("Invalid response format")
            return analytics
        except Exception as e:
            print(f"Error processing Gemini response: {str(e)}")
            return self._fallback_analytics()

load_dotenv()
MONGODB_URI = os.getenv("MONGO_URI")
from file_upload_vectorize import model
import streamlit as st

def extract_topics_from_materials(session_id):
    """Extract topics from pre-class materials"""
    materials = resources_collection.find({"session_id": session_id})
    texts = ""
    if materials:
        for material in materials:
            if 'text_content' in material:
                text = material['text_content']
                texts += text + "\n"
            else:
                st.warning("No text content found in the material.")
                return
    else:
        st.error("No pre-class materials found for this session.")
        return

    if texts:
        context_prompt = f"""
        Task: Extract Comprehensive Topics in a List Format
        You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.

        Instructions:
        Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:

        Overarching themes
        Main topics
        Subtopics and their sub-subtopics
        Indirectly related topics
        Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.

        Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.

        Output Requirements:
        Use this structure:
        {{
            "topics": [
                "Topic 1",
                "Topic 2",
                "Topic 3",
                ...
            ]
        }}
        Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.

        Content to Analyze:
        {texts}
        """
        try:
            # response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(response_mime_type="application/json", response_schema=list[Topics]))
            response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(temperature=0.3))
            if not response or not response.text:
                st.error("Error extracting topics from materials.")
                return

            topics = response.text
            return topics
        except Exception as e:
            st.error(f"Error extracting topics: {str(e)}")
            return None
    else:
        st.error("No text content found in the pre-class materials.")
        return None


def get_chat_history(user_id, session_id):
    query = {
        "user_id": ObjectId(user_id),
        "session_id": session_id,
        "timestamp": {"$lte": datetime.utcnow()}
    }
    result = chat_history_collection.find(query)
    return list(result)

def json_serializer(obj):
    if isinstance(obj, ObjectId):
        return str(obj)
    raise TypeError(f"Type {type(obj)} not serializable")

if __name__ == "__main__":
    client = MongoClient(MONGODB_URI)
    db = client["novascholar_db"]
    chat_history_collection = db["chat_history"]
    resources_collection = db["resources"]
    session_id = "S104"
    # Connect to MongoDB
    user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
    # Debug print 2: Check user_ids
    print("Found user_ids:", user_ids)

    all_chat_histories = []
    for user_id in user_ids:
        result = get_chat_history(user_id, session_id)
        # Debug print 3: Check each chat history result
        print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
        if result:
            for record in result:
                chat_history = {
                    "user_id": record["user_id"],  # Convert ObjectId to string
                    "session_id": record["session_id"],
                    "messages": record["messages"]
                }
                all_chat_histories.append(chat_history)

    print(all_chat_histories)

    # Export all chat histories to a JSON file
    # Path: sample_files/chat_histories.json
    # with open("sample_files/all_chat_histories3.json", "w") as file:
    #     json.dump(all_chat_histories, file, indent=2)

    # Debug print 4: Check chat histories
    print("Total chat histories collected:", len(all_chat_histories))

    # Extract topics with debug print
    # topics = extract_topics_from_materials(session_id)
    # # Export extracted topics to a JSON file
    # with open("sample_files/extracted_topics.json", "w") as file:
    #     json.dump(topics, file, indent=2)

    # Load extracted topics from JSON file
    with open("sample_files/extracted_topics.json", "r") as file:
        topics = json.load(file)
    # Debug print 5: Check topics
    print("Extracted topics:", topics)

    # Generate analytics
    analytics_generator = NovaScholarAnalytics()
    analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
    # Debug print 6: Check generated analytics
    print("Generated Analytics:", analytics)
pre_class_analytics4.py  DELETED

@@ -1,592 +0,0 @@
import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Dict, Any, Tuple
import spacy
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import networkx as nx
from scipy import stats
import logging
import json
from dataclasses import dataclass
from enum import Enum

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TopicDifficulty(Enum):
    EASY = "easy"
    MODERATE = "moderate"
    DIFFICULT = "difficult"
    VERY_DIFFICULT = "very_difficult"


@dataclass
class QuestionMetrics:
    complexity_score: float
    follow_up_count: int
    clarification_count: int
    time_spent: float
    sentiment_score: float

@dataclass
class TopicInsights:
    difficulty_level: TopicDifficulty
    common_confusion_points: List[str]
    question_patterns: List[str]
    time_distribution: Dict[str, float]
    engagement_metrics: Dict[str, float]
    recommended_focus_areas: List[str]

    def to_dict(self):
        return {
            "difficulty_level": self.difficulty_level.value,  # Convert enum to its value
            "common_confusion_points": self.common_confusion_points,
            "question_patterns": self.question_patterns,
            "time_distribution": {str(k): v for k, v in self.time_distribution.items()},
            "engagement_metrics": self.engagement_metrics,
            "recommended_focus_areas": self.recommended_focus_areas,
        }

class PreClassAnalytics:
    def __init__(self, nlp_model: str = "en_core_web_lg"):
        """Initialize the analytics system with necessary components."""
        self.nlp = spacy.load(nlp_model)
        self.question_indicators = {
            "what", "why", "how", "when", "where", "which", "who",
            "whose", "whom", "can", "could", "would", "will", "explain"
        }
        self.confusion_indicators = {
            "confused", "don't understand", "unclear", "not clear",
            "stuck", "difficult", "hard", "help", "explain again"
        }
        self.follow_up_indicators = {
            "also", "another", "additionally", "furthermore", "moreover",
            "besides", "related", "similarly", "again"
        }

    def preprocess_chat_history(self, chat_history: List[Dict]) -> pd.DataFrame:
        """Convert chat history to DataFrame with enhanced features."""
        messages = []
        for chat in chat_history:
            user_id = chat['user_id']['$oid']
            for msg in chat['messages']:
                try:
                    # Ensure the timestamp is in the correct format
                    if isinstance(msg['timestamp'], dict) and '$date' in msg['timestamp']:
                        timestamp = pd.to_datetime(msg['timestamp']['$date'])
                    elif isinstance(msg['timestamp'], str):
                        timestamp = pd.to_datetime(msg['timestamp'])
                    else:
                        raise ValueError("Invalid timestamp format")
                except Exception as e:
                    print(f"Error parsing timestamp: {msg['timestamp']}, error: {e}")
                    timestamp = pd.NaT  # Use NaT (Not a Time) for invalid timestamps

                messages.append({
                    'user_id': user_id,
                    'timestamp': timestamp,
                    'prompt': msg['prompt'],
                    'response': msg['response'],
                    'is_question': any(q in msg['prompt'].lower() for q in self.question_indicators),
                    'shows_confusion': any(c in msg['prompt'].lower() for c in self.confusion_indicators),
                    'is_followup': any(f in msg['prompt'].lower() for f in self.follow_up_indicators)
                })

        df = pd.DataFrame(messages)
        df['sentiment'] = df['prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
        return df

    def extract_topic_hierarchies(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """Extract hierarchical topic relationships from conversations."""
        topic_hierarchy = defaultdict(list)

        for _, row in df.iterrows():
            doc = self.nlp(row['prompt'])

            # Extract main topics and subtopics using noun chunks and dependencies
            main_topics = []
            subtopics = []

            for chunk in doc.noun_chunks:
                if chunk.root.dep_ in ('nsubj', 'dobj'):
                    main_topics.append(chunk.text.lower())
                else:
                    subtopics.append(chunk.text.lower())

            # Build hierarchy
            for main_topic in main_topics:
                topic_hierarchy[main_topic].extend(subtopics)

        # Clean and deduplicate
        return {k: list(set(v)) for k, v in topic_hierarchy.items()}

    def analyze_topic_difficulty(self, df: pd.DataFrame, topic: str) -> TopicDifficulty:
        """Determine topic difficulty based on various metrics."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False)]

        # Calculate difficulty indicators
        confusion_rate = topic_msgs['shows_confusion'].mean()
        question_rate = topic_msgs['is_question'].mean()
        follow_up_rate = topic_msgs['is_followup'].mean()
        avg_sentiment = topic_msgs['sentiment'].mean()

        # Calculate composite difficulty score
        difficulty_score = (
            confusion_rate * 0.4 +
            question_rate * 0.3 +
            follow_up_rate * 0.2 +
            (1 - (avg_sentiment + 1) / 2) * 0.1
        )

        # Map score to difficulty level
        if difficulty_score < 0.3:
            return TopicDifficulty.EASY
        elif difficulty_score < 0.5:
            return TopicDifficulty.MODERATE
        elif difficulty_score < 0.7:
            return TopicDifficulty.DIFFICULT
        else:
            return TopicDifficulty.VERY_DIFFICULT

    def identify_confusion_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Identify common patterns in student confusion."""
        confused_msgs = df[
            (df['prompt'].str.contains(topic, case=False)) &
            (df['shows_confusion'])
        ]['prompt']

        patterns = []
        for msg in confused_msgs:
            doc = self.nlp(msg)

            # Extract key phrases around confusion indicators
            for sent in doc.sents:
                for token in sent:
                    if token.text.lower() in self.confusion_indicators:
                        # Get context window around confusion indicator
                        context = sent.text
                        patterns.append(context)

        # Group similar patterns
        if patterns:
            vectorizer = TfidfVectorizer(ngram_range=(1, 3))
            tfidf_matrix = vectorizer.fit_transform(patterns)
            similarity_matrix = cosine_similarity(tfidf_matrix)

            # Cluster similar patterns
            G = nx.Graph()
            for i in range(len(patterns)):
                for j in range(i + 1, len(patterns)):
                    if similarity_matrix[i][j] > 0.5:  # Similarity threshold
                        G.add_edge(i, j)

            # Extract representative patterns from each cluster
            clusters = list(nx.connected_components(G))
            return [patterns[min(cluster)] for cluster in clusters]

        return []

    def analyze_question_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Analyze patterns in student questions about the topic."""
        topic_questions = df[
            (df['prompt'].str.contains(topic, case=False)) &
            (df['is_question'])
        ]['prompt']

        question_types = defaultdict(list)
        for question in topic_questions:
            doc = self.nlp(question)

            # Categorize questions
            if any(token.text.lower() in {"what", "define", "explain"} for token in doc):
                question_types["conceptual"].append(question)
            elif any(token.text.lower() in {"how", "steps", "process"} for token in doc):
                question_types["procedural"].append(question)
            elif any(token.text.lower() in {"why", "reason", "because"} for token in doc):
                question_types["reasoning"].append(question)
            else:
                question_types["other"].append(question)

        # Extract patterns from each category
        patterns = []
        for category, questions in question_types.items():
            if questions:
                vectorizer = TfidfVectorizer(ngram_range=(1, 3))
                tfidf_matrix = vectorizer.fit_transform(questions)

                # Get most representative questions
                feature_array = np.mean(tfidf_matrix.toarray(), axis=0)
                tfidf_sorting = np.argsort(feature_array)[::-1]
                features = vectorizer.get_feature_names_out()

                patterns.append(f"{category}: {' '.join(features[tfidf_sorting[:3]])}")

        return patterns

    def analyze_time_distribution(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Analyze time spent on different aspects of the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False)].copy()
        if len(topic_msgs) < 2:
            return {}

        topic_msgs['time_diff'] = topic_msgs['timestamp'].diff()

        # Calculate time distribution
        distribution = {
            'total_time': topic_msgs['time_diff'].sum().total_seconds() / 60,
            'avg_time_per_message': topic_msgs['time_diff'].mean().total_seconds() / 60,
            'max_time_gap': topic_msgs['time_diff'].max().total_seconds() / 60,
            'time_spent_on_questions': topic_msgs[topic_msgs['is_question']]['time_diff'].sum().total_seconds() / 60,
            'time_spent_on_confusion': topic_msgs[topic_msgs['shows_confusion']]['time_diff'].sum().total_seconds() / 60
        }

        return distribution

    def calculate_engagement_metrics(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Calculate student engagement metrics for the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False)]

        metrics = {
            'message_count': len(topic_msgs),
            'question_ratio': topic_msgs['is_question'].mean(),
            'confusion_ratio': topic_msgs['shows_confusion'].mean(),
            'follow_up_ratio': topic_msgs['is_followup'].mean(),
            'avg_sentiment': topic_msgs['sentiment'].mean(),
            'engagement_score': 0.0  # Will be calculated below
        }

        # Calculate engagement score
        metrics['engagement_score'] = (
            metrics['message_count'] * 0.3 +
            metrics['question_ratio'] * 0.25 +
            metrics['follow_up_ratio'] * 0.25 +
            (metrics['avg_sentiment'] + 1) / 2 * 0.2  # Normalize sentiment to 0-1
        )

        return metrics

    def generate_topic_insights(self, df: pd.DataFrame, topic: str) -> TopicInsights:
        """Generate comprehensive insights for a topic."""
        difficulty = self.analyze_topic_difficulty(df, topic)
        confusion_points = self.identify_confusion_patterns(df, topic)
        question_patterns = self.analyze_question_patterns(df, topic)
        time_distribution = self.analyze_time_distribution(df, topic)
        engagement_metrics = self.calculate_engagement_metrics(df, topic)

        # Generate recommended focus areas based on insights
        focus_areas = []

        if difficulty in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT):
            focus_areas.append("Fundamental concept reinforcement needed")

        if confusion_points:
            focus_areas.append(f"Address common confusion around: {', '.join(confusion_points[:3])}")

        if engagement_metrics['confusion_ratio'] > 0.3:
            focus_areas.append("Consider alternative teaching approaches")

        if time_distribution.get('time_spent_on_questions', 0) > time_distribution.get('total_time', 0) * 0.5:
            focus_areas.append("More practical examples or demonstrations needed")

        return TopicInsights(
            difficulty_level=difficulty,
            common_confusion_points=confusion_points,
            question_patterns=question_patterns,
            time_distribution=time_distribution,
            engagement_metrics=engagement_metrics,
            recommended_focus_areas=focus_areas
        )

    def analyze_student_progress(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze individual student progress and learning patterns."""
        student_progress = {}

        for student_id in df['user_id'].unique():
            student_msgs = df[df['user_id'] == student_id]

            # Calculate student-specific metrics
            progress = {
                'total_messages': len(student_msgs),
                'questions_asked': student_msgs['is_question'].sum(),
                'confusion_instances': student_msgs['shows_confusion'].sum(),
                'avg_sentiment': student_msgs['sentiment'].mean(),
                'topic_engagement': {},
                'learning_pattern': self._identify_learning_pattern(student_msgs)
            }

            # Analyze topic-specific engagement
            topics = self.extract_topic_hierarchies(student_msgs)
            for topic in topics:
                topic_msgs = student_msgs[student_msgs['prompt'].str.contains(topic, case=False)]
                progress['topic_engagement'][topic] = {
                    'message_count': len(topic_msgs),
                    'confusion_rate': topic_msgs['shows_confusion'].mean(),
                    'sentiment_trend': stats.linregress(
                        range(len(topic_msgs)),
                        topic_msgs['sentiment']
                    ).slope
                }

            student_progress[student_id] = progress

        return student_progress

    def _identify_learning_pattern(self, student_msgs: pd.DataFrame) -> str:
        """Identify student's learning pattern based on their interaction style."""
        # Calculate key metrics
        question_ratio = student_msgs['is_question'].mean()
        confusion_ratio = student_msgs['shows_confusion'].mean()
        follow_up_ratio = student_msgs['is_followup'].mean()
        sentiment_trend = stats.linregress(
            range(len(student_msgs)),
            student_msgs['sentiment']
        ).slope

        # Identify pattern
        if question_ratio > 0.6:
            return "Inquisitive Learner"
        elif confusion_ratio > 0.4:
            return "Needs Additional Support"
        elif follow_up_ratio > 0.5:
            return "Deep Dive Learner"
        elif sentiment_trend > 0:
            return "Progressive Learner"
        else:
            return "Steady Learner"

    def generate_comprehensive_report(self, chat_history: List[Dict]) -> Dict[str, Any]:
        """Generate a comprehensive analytics report."""
        # Preprocess chat history
        df = self.preprocess_chat_history(chat_history)

        # Extract topics
        topics = self.extract_topic_hierarchies(df)

        report = {
            'topics': {},
            'student_progress': self.analyze_student_progress(df),
            'overall_metrics': {
                'total_conversations': len(df),
                'unique_students': df['user_id'].nunique(),
                'avg_sentiment': df['sentiment'].mean(),
                'most_discussed_topics': Counter(
                    topic for topics_list in topics.values()
                    for topic in topics_list
                ).most_common(5)
            }
        }

        # Generate topic-specific insights
        for main_topic, subtopics in topics.items():
            subtopic_insights = {}
            for subtopic in subtopics:
                subtopic_insights[subtopic] = {
                    'insights': self.generate_topic_insights(df, subtopic),
                    'related_topics': [t for t in subtopics if t != subtopic],
                    'student_engagement': {
                        student_id: self.calculate_engagement_metrics(
                            df[df['user_id'] == student_id],
                            subtopic
                        )
                        for student_id in df['user_id'].unique()
                    }
                }

            report['topics'][main_topic] = {
                'insights': self.generate_topic_insights(df, main_topic),
                'subtopics': subtopic_insights,
                'topic_relationships': {
                    'hierarchy_depth': len(subtopics),
                    'connection_strength': self._calculate_topic_connections(df, main_topic, subtopics),
                    'progression_path': self._identify_topic_progression(df, main_topic, subtopics)
                }
            }

        # Add temporal analysis
        report['temporal_analysis'] = {
            'daily_engagement': df.groupby(df['timestamp'].dt.date).agg({
                'user_id': 'count',
                'is_question': 'sum',
                'shows_confusion': 'sum',
                'sentiment': 'mean'
            }).to_dict(),
            'peak_activity_hours': df.groupby(df['timestamp'].dt.hour)['user_id'].count().nlargest(3).to_dict(),
            'learning_trends': self._analyze_learning_trends(df)
        }

        # Add recommendations
        report['recommendations'] = self._generate_recommendations(report)

        return report

    def _calculate_topic_connections(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> Dict[str, float]:
        """Calculate connection strength between topics based on co-occurrence."""
        connections = {}
        main_topic_msgs = df[df['prompt'].str.contains(main_topic, case=False)]

        for subtopic in subtopics:
            cooccurrence = df[
                df['prompt'].str.contains(main_topic, case=False) &
                df['prompt'].str.contains(subtopic, case=False)
            ].shape[0]

            connection_strength = cooccurrence / len(main_topic_msgs) if len(main_topic_msgs) > 0 else 0
            connections[subtopic] = connection_strength

        return connections

    def _identify_topic_progression(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> List[str]:
        """Identify optimal topic progression path based on student interactions."""
        topic_difficulties = {}

        for subtopic in subtopics:
            difficulty = self.analyze_topic_difficulty(df, subtopic)
            topic_difficulties[subtopic] = difficulty.value

        # Sort subtopics by difficulty
        return sorted(subtopics, key=lambda x: topic_difficulties[x])

    def _analyze_learning_trends(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze overall learning trends across the dataset."""
        return {
            'sentiment_trend': stats.linregress(
                range(len(df)),
                df['sentiment']
            )._asdict(),
            'confusion_trend': stats.linregress(
                range(len(df)),
                df['shows_confusion']
            )._asdict(),
            'engagement_progression': self._calculate_engagement_progression(df)
        }

    def _calculate_engagement_progression(self, df: pd.DataFrame) -> Dict[str, float]:
        """Calculate how student engagement changes over time."""
        df['week'] = df['timestamp'].dt.isocalendar().week
        weekly_engagement = df.groupby('week').agg({
            'is_question': 'mean',
            'shows_confusion': 'mean',
            'is_followup': 'mean',
            'sentiment': 'mean'
        })

        return {
            'question_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_question']
            ).slope,
            'confusion_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['shows_confusion']
            ).slope,
            'follow_up_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_followup']
            ).slope,
            'sentiment_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['sentiment']
            ).slope
        }

    def _generate_recommendations(self, report: Dict[str, Any]) -> List[str]:
        """Generate actionable recommendations based on the analysis."""
        recommendations = []

        # Analyze difficulty distribution
        difficult_topics = [
            topic for topic, data in report['topics'].items()
            if data['insights'].difficulty_level in
            (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT)
        ]

        if difficult_topics:
            recommendations.append(
                f"Consider providing additional resources for challenging topics: {', '.join(difficult_topics)}"
            )

        # Analyze student engagement
        avg_engagement = np.mean([
            progress['questions_asked'] / progress['total_messages']
            for progress in report['student_progress'].values()
        ])

        if avg_engagement < 0.3:
            recommendations.append(
                "Implement more interactive elements to increase student engagement"
            )

        # Analyze temporal patterns
        peak_hours = list(report['temporal_analysis']['peak_activity_hours'].keys())
        recommendations.append(
            f"Consider scheduling additional support during peak activity hours: {peak_hours}"
        )

        # Analyze learning trends
        # sentiment_trend = report['temporal_analysis']['learning_trends']['sentiment_trend']
        # if sentiment_trend < 0:
        #     recommendations.append(
        #         "Review teaching approach to address declining student satisfaction"
        #     )
        sentiment_trend = report.get('temporal_analysis', {}).get('learning_trends', {}).get('sentiment_trend', None)
        if isinstance(sentiment_trend, (int, float)):
            if sentiment_trend < 0:
                recommendations.append(
                    "Review teaching approach to address declining student satisfaction"
                )
        elif isinstance(sentiment_trend, dict):
            # Handle the case where sentiment_trend is a dictionary
            print(f"Unexpected dict format for sentiment_trend: {sentiment_trend}")
        else:
            print(f"Unexpected type for sentiment_trend: {type(sentiment_trend)}")

        return recommendations

class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, TopicDifficulty):
            return obj.value
        if isinstance(obj, TopicInsights):
            return obj.to_dict()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

def convert_insights_to_dict(report):
    for main_topic, data in report['topics'].items():
        if isinstance(data['insights'], TopicInsights):
            data['insights'] = data['insights'].to_dict()
        for subtopic, subdata in data['subtopics'].items():
            if isinstance(subdata['insights'], TopicInsights):
                subdata['insights'] = subdata['insights'].to_dict()

if __name__ == "__main__":
    # Load chat history data
    chat_history = None
    with open('sample_files/chat_history_corpus.json', 'r', encoding="utf-8") as file:
        chat_history = json.load(file)

    # Initialize analytics system
    analytics = PreClassAnalytics()

    # Generate comprehensive report
    report = analytics.generate_comprehensive_report(chat_history)

    # Convert insights to dictionary
    # convert_insights_to_dict(report)

    print(json.dumps(report, indent=4, cls=CustomJSONEncoder))
    # print(report)
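
As a side note on the deleted module's topic extraction: extract_topic_hierarchies keys the hierarchy on noun chunks whose root is a subject or direct object. A small standalone illustration of that dependency-based split (not from the repository), using en_core_web_sm here only to keep the example light while the deleted module loaded en_core_web_lg:

    import spacy

    # Any English pipeline with a dependency parser works for this sketch.
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Gradient descent minimizes the loss function with small steps.")

    main_topics, subtopics = [], []
    for chunk in doc.noun_chunks:
        # Subject/direct-object chunks become main topics, everything else a subtopic.
        if chunk.root.dep_ in ("nsubj", "dobj"):
            main_topics.append(chunk.text.lower())
        else:
            subtopics.append(chunk.text.lower())

    print(main_topics)  # e.g. ['gradient descent', 'the loss function']
    print(subtopics)    # e.g. ['small steps']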
session_page.py
CHANGED
@@ -34,6 +34,7 @@ from bs4 import BeautifulSoup
|
|
34 |
import streamlit.components.v1 as components
|
35 |
from live_chat_feature import display_live_chat_interface
|
36 |
from code_playground import display_code_playground
|
|
|
37 |
|
38 |
# Load environment variables
|
39 |
load_dotenv()
|
@@ -550,36 +551,107 @@ def display_preclass_content(session, student_id, course_id):
|
|
550 |
|
551 |
import requests
|
552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
553 |
def fetch_youtube_video_title(video_url):
|
554 |
-
"""
|
|
|
|
|
555 |
api_key = os.getenv("YOUTUBE_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
556 |
video_id = extract_youtube_id(video_url)
|
557 |
if not video_id:
|
558 |
return None
|
559 |
|
560 |
url = f"https://www.googleapis.com/youtube/v3/videos?id={video_id}&key={api_key}&part=snippet"
|
561 |
-
|
562 |
-
|
|
|
|
|
563 |
data = response.json()
|
564 |
-
if
|
565 |
-
|
566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
567 |
|
568 |
def upload_video_source(course_id, session_id, video_url):
|
569 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
570 |
# Fetch video title
|
571 |
video_title = fetch_youtube_video_title(video_url)
|
572 |
if not video_title:
|
573 |
-
|
574 |
-
|
575 |
-
#
|
576 |
-
# Extract transcript from YouTube video
|
577 |
transcript = extract_youtube_transcript(video_url)
|
578 |
-
|
579 |
if not transcript:
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
# Create resource document
|
584 |
resource_data = {
|
585 |
"_id": ObjectId(),
|
@@ -590,37 +662,89 @@ def upload_video_source(course_id, session_id, video_url):
|
|
590 |
"text_content": transcript,
|
591 |
"material_type": "video",
|
592 |
"source_url": video_url,
|
593 |
-
"uploaded_at": datetime.utcnow()
|
594 |
-
|
|
|
|
|
595 |
# Check if resource already exists
|
596 |
existing_resource = resources_collection.find_one({
|
597 |
"session_id": session_id,
|
598 |
-
"
|
599 |
})
|
600 |
|
601 |
if existing_resource:
|
602 |
-
st.warning("This video
|
|
|
|
|
|
|
|
|
|
|
|
|
603 |
return existing_resource["_id"]
|
604 |
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
|
623 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
624 |
|
625 |
def upload_preclass_materials(session_id, course_id):
|
626 |
"""Upload pre-class materials and manage external resources for a session"""
|
@@ -648,8 +772,8 @@ def upload_preclass_materials(session_id, course_id):
|
|
648 |
if st.button("Upload Video"):
|
649 |
with st.spinner("Processing video source..."):
|
650 |
video_resource_id = upload_video_source(course_id, session_id, video_url)
|
651 |
-
if video_resource_id:
|
652 |
-
|
653 |
|
654 |
with external_tab:
|
655 |
# Fetch and display external resources
|
@@ -717,7 +841,10 @@ def upload_preclass_materials(session_id, course_id):
|
|
717 |
for material_type, resources in grouped_resources.items():
|
718 |
st.markdown(f"##### {material_type.capitalize()} Resources")
|
719 |
for material in resources:
|
720 |
-
|
|
|
|
|
|
|
721 |
|
722 |
def extract_external_content(url, content_type):
|
723 |
"""Extract content from external resources based on their type"""
|
@@ -731,18 +858,57 @@ def extract_external_content(url, content_type):
|
|
731 |
return None
|
732 |
|
733 |
def extract_youtube_transcript(url):
|
734 |
-
"""
|
|
|
|
|
735 |
try:
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
transcript
|
741 |
-
|
742 |
-
|
743 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
744 |
except Exception as e:
|
745 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
746 |
return None
|
747 |
|
748 |
def extract_web_article(url):
|
@@ -813,17 +979,54 @@ def upload_external_resource(course_id, session_id, title, content, content_type
|
|
813 |
return resource_id
|
814 |
|
815 |
def extract_youtube_id(url):
|
816 |
-
"""
|
817 |
-
|
818 |
-
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
827 |
return None
|
828 |
|
829 |
def display_live_presentation(session, user_type, course_id):
|
|
|
34 |
import streamlit.components.v1 as components
|
35 |
from live_chat_feature import display_live_chat_interface
|
36 |
from code_playground import display_code_playground
|
37 |
+
from urllib.parse import urlparse, parse_qs
|
38 |
|
39 |
# Load environment variables
|
40 |
load_dotenv()
|
|
|
551 |
|
552 |
import requests
|
553 |
|
554 |
+
def get_supported_url_formats():
|
555 |
+
"""Return a list of supported URL formats for faculty reference"""
|
556 |
+
return """
|
557 |
+
Supported YouTube URL formats:
|
558 |
+
1. Standard watch URL: https://www.youtube.com/watch?v=VIDEO_ID
|
559 |
+
2. Short URL: https://youtu.be/VIDEO_ID
|
560 |
+
3. Embed URL: https://www.youtube.com/embed/VIDEO_ID
|
561 |
+
4. Mobile URL: https://m.youtube.com/watch?v=VIDEO_ID
|
562 |
+
5. YouTube Shorts: https://www.youtube.com/shorts/VIDEO_ID
|
563 |
+
|
564 |
+
You can copy any of these formats from:
|
565 |
+
- YouTube website (Share button)
|
566 |
+
- YouTube mobile app (Share button)
|
567 |
+
- Browser address bar while watching the video
|
568 |
+
"""
|
569 |
+
|
570 |
+
|
571 |
+
def display_url_guidance():
|
572 |
+
"""Display guidance for faculty on how to get the correct URL"""
|
573 |
+
st.info("""
|
574 |
+
📝 How to get the correct YouTube URL:
|
575 |
+
1. Go to the YouTube video you want to share
|
576 |
+
2. Click the 'Share' button below the video
|
577 |
+
3. Copy the URL provided in the share dialog
|
578 |
+
4. Paste it here
|
579 |
+
|
580 |
+
The URL should start with either 'youtube.com' or 'youtu.be'
|
581 |
+
""")
|
582 |
def fetch_youtube_video_title(video_url):
|
583 |
+
"""
|
584 |
+
Fetch the title of a YouTube video with detailed error handling
|
585 |
+
"""
|
586 |
api_key = os.getenv("YOUTUBE_API_KEY")
|
587 |
+
if not api_key:
|
588 |
+
st.error("⚠️ System Configuration Error: YouTube API key not configured.")
|
589 |
+
st.write("Please contact technical support for assistance.")
|
590 |
+
return None
|
591 |
+
|
592 |
video_id = extract_youtube_id(video_url)
|
593 |
if not video_id:
|
594 |
return None
|
595 |
|
596 |
url = f"https://www.googleapis.com/youtube/v3/videos?id={video_id}&key={api_key}&part=snippet"
|
597 |
+
try:
|
598 |
+
response = requests.get(url, timeout=10)
|
599 |
+
response.raise_for_status()
|
600 |
+
|
601 |
data = response.json()
|
602 |
+
if not data.get("items"):
|
603 |
+
st.error("⚠️ Video not found or might be private.")
|
604 |
+
st.write("""
|
605 |
+
Please check if:
|
606 |
+
1. The video is publicly available
|
607 |
+
2. The URL is correct
|
608 |
+
3. The video hasn't been deleted
|
609 |
+
""")
|
610 |
+
return None
|
611 |
+
|
612 |
+
return data["items"][0]["snippet"]["title"]
|
613 |
+
|
614 |
+
except requests.exceptions.RequestException as e:
|
615 |
+
if "quotaExceeded" in str(e):
|
616 |
+
st.error("⚠️ YouTube API quota exceeded.")
|
617 |
+
st.write("""
|
618 |
+
The system has reached its daily limit for video processing.
|
619 |
+
Please try:
|
620 |
+
1. Waiting a few hours
|
621 |
+
2. Trying again tomorrow
|
622 |
+
3. Contacting support if the issue persists
|
623 |
+
""")
|
624 |
+
else:
|
625 |
+
st.error(f"Error fetching video title: {str(e)}")
|
626 |
+
st.write("Please try again or choose a different video.")
|
627 |
+
return None
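As a standalone illustration of the lookup above (not part of the commit; the video ID and API key are placeholders), the YouTube Data API v3 `videos` endpoint returns the title under items[0].snippet.title:
import requests

def lookup_title(video_id, api_key):
    # Same endpoint and "snippet" part as fetch_youtube_video_title above
    resp = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params={"id": video_id, "key": api_key, "part": "snippet"},
        timeout=10,
    )
    resp.raise_for_status()
    items = resp.json().get("items", [])
    return items[0]["snippet"]["title"] if items else None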
|
628 |
|
629 |
def upload_video_source(course_id, session_id, video_url):
|
630 |
+
"""
|
631 |
+
Upload video source and its transcript with comprehensive error handling
|
632 |
+
"""
|
633 |
+
if not video_url:
|
634 |
+
st.error("Please provide a YouTube URL.")
|
635 |
+
display_url_guidance()
|
636 |
+
return None
|
637 |
+
|
638 |
+
# Display processing message
|
639 |
+
# with st.spinner("Processing your YouTube video..."):
|
640 |
+
# Validate video URL
|
641 |
+
video_id = extract_youtube_id(video_url)
|
642 |
+
if not video_id:
|
643 |
+
return None
|
644 |
+
|
645 |
# Fetch video title
|
646 |
video_title = fetch_youtube_video_title(video_url)
|
647 |
if not video_title:
|
648 |
+
return None
|
649 |
+
|
650 |
+
# Extract transcript
|
|
|
651 |
transcript = extract_youtube_transcript(video_url)
|
|
|
652 |
if not transcript:
|
653 |
+
return None
|
654 |
+
|
|
|
655 |
# Create resource document
|
656 |
resource_data = {
|
657 |
"_id": ObjectId(),
|
|
|
662 |
"text_content": transcript,
|
663 |
"material_type": "video",
|
664 |
"source_url": video_url,
|
665 |
+
"uploaded_at": datetime.utcnow(),
|
666 |
+
"video_id": video_id
|
667 |
+
}
|
668 |
+
|
669 |
# Check if resource already exists
|
670 |
existing_resource = resources_collection.find_one({
|
671 |
"session_id": session_id,
|
672 |
+
"video_id": video_id
|
673 |
})
|
674 |
|
675 |
if existing_resource:
|
676 |
+
st.warning("⚠️ This video has already been added to this session.")
|
677 |
+
st.write("""
|
678 |
+
Options:
|
679 |
+
1. Choose a different video
|
680 |
+
2. Use the existing video resource
|
681 |
+
3. Remove the existing video first if you want to re-add it
|
682 |
+
""")
|
683 |
return existing_resource["_id"]
|
684 |
|
685 |
+
try:
|
686 |
+
# Insert new resource
|
687 |
+
result = resources_collection.insert_one(resource_data)
|
688 |
+
resource_id = result.inserted_id
|
689 |
+
|
690 |
+
# Update course document
|
691 |
+
update_result = courses_collection.update_one(
|
692 |
+
{
|
693 |
+
"course_id": course_id,
|
694 |
+
"sessions.session_id": session_id
|
695 |
+
},
|
696 |
+
{
|
697 |
+
"$push": {"sessions.$.pre_class.resources": resource_id}
|
698 |
+
}
|
699 |
+
)
|
700 |
+
|
701 |
+
if update_result.modified_count == 0:
|
702 |
+
st.error("⚠️ Failed to update course with new resource.")
|
703 |
+
st.write("""
|
704 |
+
The video was processed but couldn't be added to the course.
|
705 |
+
This might be because:
|
706 |
+
1. The course or session ID is invalid
|
707 |
+
2. You don't have permission to modify this course
|
708 |
+
3. There was a system error
|
709 |
+
|
710 |
+
Please try again or contact support if the issue persists.
|
711 |
+
""")
|
712 |
+
# Rollback resource insertion
|
713 |
+
resources_collection.delete_one({"_id": resource_id})
|
714 |
+
return None
|
715 |
+
|
716 |
+
# Create vector store for the transcript
|
717 |
+
# create_vector_store(transcript, resource_id)
|
718 |
+
# Create vector store for the transcript
|
719 |
+
vector_store_result = create_vector_store(transcript, resource_id)
|
720 |
+
if not vector_store_result:
|
721 |
+
st.error("⚠️ Failed to create vector store for the transcript.")
|
722 |
+
# Rollback insertions
|
723 |
+
resources_collection.delete_one({"_id": resource_id})
|
724 |
+
return None
|
725 |
+
|
726 |
+
st.success("✅ Video successfully added to your course!")
|
727 |
+
st.write(f"""
|
728 |
+
Added: "{video_title}"
|
729 |
+
You can now:
|
730 |
+
1. Add more videos
|
731 |
+
2. Preview the added video
|
732 |
+
3. Continue building your course
|
733 |
+
""")
|
734 |
+
return resource_id
|
735 |
+
|
736 |
+
except Exception as e:
|
737 |
+
st.error("⚠️ Error uploading video source.")
|
738 |
+
st.write(f"""
|
739 |
+
There was an error while saving the video:
|
740 |
+
{str(e)}
|
741 |
+
|
742 |
+
Please:
|
743 |
+
1. Try again
|
744 |
+
2. Choose a different video
|
745 |
+
3. Contact support if the issue persists
|
746 |
+
""")
|
747 |
+
return None
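The error handling above follows an insert-then-rollback pattern so the resources and courses collections stay consistent; a condensed sketch of that logic (same collection names as this file, messaging omitted):
# Condensed view of the consistency logic above: if the course update does not
# take effect, the freshly inserted resource document is removed again.
result = resources_collection.insert_one(resource_data)
updated = courses_collection.update_one(
    {"course_id": course_id, "sessions.session_id": session_id},
    {"$push": {"sessions.$.pre_class.resources": result.inserted_id}},
)
if updated.modified_count == 0:
    resources_collection.delete_one({"_id": result.inserted_id})  # roll back the orphan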
|
748 |
|
749 |
def upload_preclass_materials(session_id, course_id):
|
750 |
"""Upload pre-class materials and manage external resources for a session"""
|
|
|
772 |
if st.button("Upload Video"):
|
773 |
with st.spinner("Processing video source..."):
|
774 |
video_resource_id = upload_video_source(course_id, session_id, video_url)
|
775 |
+
# if video_resource_id:
|
776 |
+
# st.success("Video source uploaded successfully!")
|
777 |
|
778 |
with external_tab:
|
779 |
# Fetch and display external resources
|
|
|
841 |
for material_type, resources in grouped_resources.items():
|
842 |
st.markdown(f"##### {material_type.capitalize()} Resources")
|
843 |
for material in resources:
|
844 |
+
resource_info = f"- **{material['file_name']}** ({material['file_type']})"
|
845 |
+
if 'source_url' in material:
|
846 |
+
resource_info += f" - [URL]({material['source_url']})"
|
847 |
+
st.markdown(resource_info)
|
848 |
|
849 |
def extract_external_content(url, content_type):
|
850 |
"""Extract content from external resources based on their type"""
|
|
|
858 |
return None
|
859 |
|
860 |
def extract_youtube_transcript(url):
|
861 |
+
"""
|
862 |
+
Extract transcript from YouTube videos with detailed error handling
|
863 |
+
"""
|
864 |
try:
|
865 |
+
video_id = extract_youtube_id(url)
|
866 |
+
if not video_id:
|
867 |
+
return None
|
868 |
+
|
869 |
+
# Get transcript with retries
|
870 |
+
max_retries = 3
|
871 |
+
for attempt in range(max_retries):
|
872 |
+
try:
|
873 |
+
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
874 |
+
# Combine transcript text with proper spacing and punctuation
|
875 |
+
full_text = ''
|
876 |
+
for entry in transcript:
|
877 |
+
text = entry['text'].strip()
|
878 |
+
if text:
|
879 |
+
if full_text and not full_text.endswith(('.', '!', '?', '..."')):
|
880 |
+
full_text += '. '
|
881 |
+
full_text += text + ' '
|
882 |
+
return full_text.strip()
|
883 |
+
except Exception as e:
|
884 |
+
if attempt == max_retries - 1:
|
885 |
+
raise e
|
886 |
+
continue
|
887 |
+
|
888 |
except Exception as e:
|
889 |
+
error_message = str(e)
|
890 |
+
if "Video unavailable" in error_message:
|
891 |
+
st.error("⚠️ This video is unavailable or private. Please check if:")
|
892 |
+
st.write("""
|
893 |
+
- The video is set to public or unlisted
|
894 |
+
- The video hasn't been deleted
|
895 |
+
- You have the correct URL
|
896 |
+
""")
|
897 |
+
elif "Subtitles are disabled" in error_message:
|
898 |
+
st.error("⚠️ This video doesn't have subtitles/transcript available.")
|
899 |
+
st.write("""
|
900 |
+
Unfortunately, this video cannot be used because:
|
901 |
+
- It doesn't have closed captions or subtitles
|
902 |
+
- The creator hasn't enabled transcript generation
|
903 |
+
|
904 |
+
Please choose another video that has subtitles available.
|
905 |
+
You can check if a video has subtitles by:
|
906 |
+
1. Playing the video on YouTube
|
907 |
+
2. Clicking the 'CC' button in the video player
|
908 |
+
""")
|
909 |
+
else:
|
910 |
+
st.error(f"Could not extract YouTube transcript: {error_message}")
|
911 |
+
st.write("Please try again or choose a different video.")
|
912 |
return None
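For context, the underlying library call is compact; a minimal sketch without the retry loop and user-facing messaging (assuming the youtube_transcript_api package used above):
from youtube_transcript_api import YouTubeTranscriptApi

def plain_transcript(video_id):
    # Each entry is a dict with 'text', 'start' and 'duration' keys
    entries = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(e["text"].strip() for e in entries if e["text"].strip())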
|
913 |
|
914 |
def extract_web_article(url):
|
|
|
979 |
return resource_id
|
980 |
|
981 |
def extract_youtube_id(url):
|
982 |
+
"""
|
983 |
+
Extract YouTube video ID from various URL formats
|
984 |
+
"""
|
985 |
+
if not url:
|
986 |
+
st.error("Please provide a YouTube URL.")
|
987 |
+
display_url_guidance()
|
988 |
+
return None
|
989 |
+
|
990 |
+
# Clean the URL
|
991 |
+
url = url.strip()
|
992 |
+
|
993 |
+
# Basic URL validation
|
994 |
+
if not ('youtube.com' in url or 'youtu.be' in url):
|
995 |
+
st.error("This doesn't appear to be a YouTube URL.")
|
996 |
+
st.write(get_supported_url_formats())
|
997 |
+
return None
|
998 |
+
|
999 |
+
# Try to extract using regex patterns
|
1000 |
+
patterns = [
|
1001 |
+
r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/e\/|youtube\.com\/shorts\/)([^&\n?#]+)',
|
1002 |
+
r'(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})'
|
1003 |
+
]
|
1004 |
+
|
1005 |
+
for pattern in patterns:
|
1006 |
+
match = re.search(pattern, url)
|
1007 |
+
if match:
|
1008 |
+
video_id = match.group(1)
|
1009 |
+
if len(video_id) != 11: # YouTube IDs are always 11 characters
|
1010 |
+
st.error("Invalid YouTube video ID length. Please check your URL.")
|
1011 |
+
display_url_guidance()
|
1012 |
+
return None
|
1013 |
+
return video_id
|
1014 |
+
|
1015 |
+
# If regex fails, try parsing URL components
|
1016 |
+
try:
|
1017 |
+
parsed_url = urlparse(url)
|
1018 |
+
if 'youtube.com' in parsed_url.netloc:
|
1019 |
+
query_params = parse_qs(parsed_url.query)
|
1020 |
+
if 'v' in query_params:
|
1021 |
+
return query_params['v'][0]
|
1022 |
+
elif 'youtu.be' in parsed_url.netloc:
|
1023 |
+
return parsed_url.path.lstrip('/')
|
1024 |
+
except Exception:
|
1025 |
+
pass
|
1026 |
+
|
1027 |
+
# If all extraction methods fail
|
1028 |
+
st.error("Could not extract video ID from the provided URL.")
|
1029 |
+
st.write(get_supported_url_formats())
|
1030 |
return None
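A quick sanity check of the first regex pattern above against the supported URL formats; illustrative only, with a placeholder 11-character video ID:
import re

PATTERN = (r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/'
           r'|youtube\.com\/v\/|youtube\.com\/e\/|youtube\.com\/shorts\/)([^&\n?#]+)')

for url in [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://youtu.be/dQw4w9WgXcQ",
    "https://m.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://www.youtube.com/shorts/dQw4w9WgXcQ",
]:
    match = re.search(PATTERN, url)
    assert match and match.group(1) == "dQw4w9WgXcQ"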
|
1031 |
|
1032 |
def display_live_presentation(session, user_type, course_id):
|