Harshal Vhatkar committed
Commit 7dd7d88 · 1 Parent(s): e7247bf

improve video upload

file_upload_vectorize.py CHANGED

@@ -141,12 +141,18 @@ def extract_text_from_file(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
 
+from sentence_transformers import SentenceTransformer
+
 def get_embedding(text):
-    response = openai.embeddings.create(
-        model="text-embedding-ada-002",
-        input=text
-    )
-    return response.data[0].embedding
+    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    embeddings = model.encode(text)
+    # response = openai.embeddings.create(
+    #     model="text-embedding-ada-002",
+    #     input=text
+    # )
+    # return response.data[0].embedding
+    return embeddings
+
 
 def create_vector_store(text, resource_id):
     # resource_object_id = ObjectId(resource_id)
@@ -169,11 +175,19 @@ def create_vector_store(text, resource_id):
 
     vector_data = {
         "resource_id": resource_id,
-        "vector": embedding,
+        "vector": embedding.tolist(),
         "text": text,
         "created_at": datetime.utcnow()
     }
 
-    vectors_collection.insert_one(vector_data)
-
-    # return VectorStoreIndex.from_documents([document])
+    # vectors_collection.insert_one(vector_data)
+    # Store in MongoDB
+    try:
+        vectors_collection.insert_one(vector_data)
+    except Exception as db_error:
+        st.error(f"Database error: {str(db_error)}")
+        return None
+
+    # return VectorStoreIndex.from_documents([document])
+    return vector_data
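Note on the embedding change above: get_embedding now encodes text locally with the Sentence-Transformers model all-MiniLM-L6-v2 instead of calling the OpenAI API, and create_vector_store stores the vector as a plain Python list so MongoDB can serialize it. Below is a minimal standalone sketch of that flow, for reference only; the connection string, database, and collection names are placeholders rather than the values this app actually uses.

from datetime import datetime

from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

# Placeholder connection details -- the real app configures these elsewhere.
client = MongoClient("mongodb://localhost:27017")
vectors_collection = client["demo_db"]["resource_vectors"]

# Same model the commit switches to; downloaded on first use.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def embed_and_store(text: str, resource_id: str) -> dict:
    """Encode the text locally and persist the vector alongside the raw text."""
    embedding = model.encode(text)  # NumPy array, 384 dimensions for MiniLM-L6-v2
    vector_data = {
        "resource_id": resource_id,
        "vector": embedding.tolist(),  # lists are BSON-serializable; NumPy arrays are not
        "text": text,
        "created_at": datetime.utcnow(),
    }
    vectors_collection.insert_one(vector_data)
    return vector_data

if __name__ == "__main__":
    doc = embed_and_store("Transcript excerpt about binary search trees.", "demo-resource-1")
    print(len(doc["vector"]), doc["vector"][:5])

Converting to a list before insert_one is the key detail: pymongo cannot encode a raw NumPy array, which is exactly the kind of failure the added try/except around the insert now reports instead of crashing the Streamlit page.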
pre_class_analytics3.py DELETED
@@ -1,499 +0,0 @@
1
- from datetime import datetime
2
- import json
3
- from bson import ObjectId
4
- import typing_extensions as typing
5
- import google.generativeai as genai
6
- from typing import List, Dict, Any
7
- import numpy as np
8
- from collections import defaultdict
9
-
10
- from dotenv import load_dotenv
11
- import os
12
- import pymongo
13
- from pymongo import MongoClient
14
-
15
- load_dotenv()
16
- GEMINI_API_KEY = os.getenv("GEMINI_KEY")
17
-
18
- class NovaScholarAnalytics:
19
- def __init__(self, model_name: str = "gemini-1.5-flash"):
20
- genai.configure(api_key=GEMINI_API_KEY)
21
- self.model = genai.GenerativeModel(model_name)
22
-
23
- def _preprocess_chat_histories(self, chat_histories: List[Dict]) -> List[Dict]:
24
- # Code 2:
25
- """Preprocess chat histories to focus on relevant information."""
26
- processed = []
27
-
28
- for chat in chat_histories:
29
- # Convert ObjectId to string if it's an ObjectId
30
- user_id = str(chat["user_id"]["$oid"]) if isinstance(chat["user_id"], dict) and "$oid" in chat["user_id"] else str(chat["user_id"])
31
-
32
- try:
33
- processed_chat = {
34
- "user_id": user_id,
35
- "messages": [
36
- {
37
- "prompt": msg["prompt"],
38
- "response": msg["response"]
39
- }
40
- for msg in chat["messages"]
41
- ]
42
- }
43
- processed.append(processed_chat)
44
- print(f"Successfully processed chat for user: {user_id}")
45
- except Exception as e:
46
- print(f"Error processing chat for user: {user_id}")
47
- print(f"Error details: {str(e)}")
48
- continue
49
-
50
- return processed
51
-
52
- def _create_analytics_prompt(self, chat_histories: List[Dict], all_topics: List[str]) -> str:
53
- """Creates a structured prompt for Gemini to analyze chat histories."""
54
- return f"""Analyze the provided student chat histories for a university course and generate concise, actionable analytics WITH EVIDENCE.
55
-
56
- Context:
57
- - Chat histories: {json.dumps(chat_histories, indent=2)}
58
- - These are pre-class interactions between students and an AI tutor
59
- - Topics covered: {', '.join(all_topics)}
60
-
61
- Your task is to provide analytics with supporting evidence from the chat histories.
62
-
63
- Output Format (strictly follow this JSON structure):
64
- {{
65
- "topic_wise_insights": [
66
- {{
67
- "topic": "<string>",
68
- "struggling_percentage": <number between 0 and 1>,
69
- "evidence": {{
70
- "calculation": "Explain how struggling_percentage was calculated",
71
- "supporting_messages": [
72
- {{
73
- "user_id": "<string>",
74
- "message": "<string>",
75
- "reasoning": "Why this message indicates struggling"
76
- }}
77
- ]
78
- }},
79
- "key_issues": ["<string>"],
80
- "key_misconceptions": ["<string>"],
81
- "evidence_for_issues": [
82
- {{
83
- "issue": "<string>",
84
- "supporting_messages": [
85
- {{
86
- "user_id": "<string>",
87
- "message": "<string>"
88
- }}
89
- ]
90
- }}
91
- ]
92
- }}
93
- ],
94
- "ai_recommended_actions": [
95
- {{
96
- "action": "<string>",
97
- "priority": "high|medium|low",
98
- "reasoning": "<string>",
99
- "evidence": {{
100
- "supporting_messages": [
101
- {{
102
- "user_id": "<string>",
103
- "message": "<string>",
104
- "relevance": "Why this message supports the recommendation"
105
- }}
106
- ],
107
- "pattern_description": "Description of the pattern observed in chat histories"
108
- }},
109
- "expected_outcome": "<string>"
110
- }}
111
- ],
112
- "student_analytics": [
113
- {{
114
- "student_id": "<string>",
115
- "engagement_metrics": {{
116
- "participation_level": <number between 0 and 1>,
117
- "concept_understanding": "strong|moderate|needs_improvement",
118
- "question_quality": "advanced|intermediate|basic"
119
- }},
120
- "evidence": {{
121
- "participation_calculation": "Explain how participation_level was calculated",
122
- "understanding_evidence": [
123
- {{
124
- "message": "<string>",
125
- "analysis": "Why this indicates their understanding level"
126
- }}
127
- ],
128
- "question_quality_evidence": [
129
- {{
130
- "question": "<string>",
131
- "analysis": "Why this question is classified at this level"
132
- }}
133
- ]
134
- }},
135
- "struggling_topics": ["<string>"],
136
- "personalized_recommendation": "<string>"
137
- }}
138
- ]
139
- }}
140
-
141
- Guidelines for Analysis:
142
- 1. For every insight, recommendation, or metric, provide specific evidence from the chat histories
143
- 2. Explain calculations (e.g., how struggling_percentage was derived)
144
- 3. Include relevant message excerpts that support each conclusion
145
- 4. For recommendations, show the pattern of student interactions that led to that recommendation
146
- 5. When analyzing question quality or understanding, provide reasoning for the classification
147
-
148
- The response must adhere strictly to the above JSON structure, with all fields populated appropriately."""
149
-
150
- def _validate_analytics_with_evidence(self, initial_analytics: Dict) -> Dict:
151
- """Validate the initial analytics by checking evidence."""
152
- validation_prompt = f"""Review and validate the following analytics based on the provided evidence.
153
-
154
- Analytics to validate: {json.dumps(initial_analytics, indent=2)}
155
-
156
- For each section:
157
- 1. Verify if the evidence supports the conclusions
158
- 2. Check if calculations (percentages, metrics) are justified by the data
159
- 3. Validate if recommendations are supported by patterns in the chat history
160
-
161
- Return a JSON with the same structure, but only include insights/recommendations that have strong supporting evidence.
162
- For any removed items, include them in a separate "insufficient_evidence" section with explanation."""
163
-
164
- try:
165
- validation_response = self.model.generate_content(
166
- validation_prompt,
167
- generation_config=genai.GenerationConfig(
168
- response_mime_type="application/json",
169
- temperature=0.1
170
- )
171
- )
172
-
173
- validated_analytics = json.loads(validation_response.text)
174
- return validated_analytics
175
-
176
- except Exception as e:
177
- print(f"Error in validation: {str(e)}")
178
- return initial_analytics
179
-
180
- def _enrich_analytics(self, analytics: Dict) -> Dict:
181
- """Add derived insights and metrics to the validated analytics."""
182
- try:
183
- # Calculate class distribution
184
- total_students = len(analytics.get("student_insights", []))
185
- performance_distribution = defaultdict(int)
186
-
187
- for student in analytics.get("student_insights", []):
188
- metrics = student.get("engagement_metrics", {})
189
- understanding = metrics.get("concept_understanding", "moderate")
190
-
191
- if understanding == "strong":
192
- performance_distribution["high_performers"] += 1
193
- elif understanding == "needs_improvement":
194
- performance_distribution["at_risk"] += 1
195
- else:
196
- performance_distribution["average_performers"] += 1
197
-
198
- # Convert to percentages
199
- class_distribution = {
200
- level: count/total_students if total_students > 0 else 0
201
- for level, count in performance_distribution.items()
202
- }
203
-
204
- # Calculate overall engagement
205
- engagement_sum = sum(
206
- student.get("engagement_metrics", {}).get("participation_level", 0)
207
- for student in analytics.get("student_insights", [])
208
- )
209
- overall_engagement = engagement_sum / total_students if total_students > 0 else 0
210
-
211
- # Identify critical topics (those with high struggling percentage)
212
- critical_topics = [
213
- topic["topic"]
214
- for topic in analytics.get("topic_wise_insights", [])
215
- if topic.get("struggling_percentage", 0) > 0.7 # 70% threshold
216
- ]
217
-
218
- # Identify students needing intervention
219
- immediate_attention = []
220
- monitoring_required = []
221
-
222
- for student in analytics.get("student_insights", []):
223
- student_id = student.get("student_id")
224
- metrics = student.get("engagement_metrics", {})
225
-
226
- # Check for immediate attention needed
227
- if (metrics.get("concept_understanding") == "needs_improvement" or
228
- metrics.get("participation_level", 0) < 0.3 or # Less than 30% participation
229
- len(student.get("struggling_topics", [])) > 2): # Struggling with more than 2 topics
230
- immediate_attention.append(student_id)
231
- # Check for monitoring
232
- elif (metrics.get("concept_understanding") == "moderate" or
233
- metrics.get("participation_level", 0) < 0.5): # Less than 50% participation
234
- monitoring_required.append(student_id)
235
-
236
- # Add enriched data to analytics
237
- analytics["course_health"] = {
238
- "overall_engagement": overall_engagement,
239
- "critical_topics": critical_topics,
240
- "class_distribution": class_distribution
241
- }
242
-
243
- analytics["intervention_metrics"] = {
244
- "immediate_attention_needed": immediate_attention,
245
- "monitoring_required": monitoring_required
246
- }
247
-
248
- # Add evidence for enriched metrics
249
- analytics["course_health"]["evidence"] = {
250
- "engagement_calculation": f"Calculated from average participation level of {total_students} students",
251
- "critical_topics_criteria": "Topics where over 70% of students are struggling",
252
- "distribution_calculation": "Based on concept understanding levels from student metrics"
253
- }
254
-
255
- analytics["intervention_metrics"]["evidence"] = {
256
- "immediate_attention_criteria": "Students with low understanding, participation < 30%, or >2 struggling topics",
257
- "monitoring_criteria": "Students with moderate understanding or participation < 50%"
258
- }
259
-
260
- return analytics
261
-
262
- except Exception as e:
263
- print(f"Error enriching analytics: {str(e)}")
264
- return analytics # Return original analytics if enrichment fails
265
-
266
- def generate_analytics(self, chat_histories: List[Dict], all_topics: List[str]) -> Dict:
267
- """Main method to generate analytics with evidence-based validation."""
268
- try:
269
- if not chat_histories or not all_topics:
270
- print("Missing required input data")
271
- return self._fallback_analytics()
272
-
273
- try:
274
- processed_histories = self._preprocess_chat_histories(chat_histories)
275
- print("Successfully preprocessed chat histories")
276
- except Exception as preprocess_error:
277
- print(f"Error in preprocessing: {str(preprocess_error)}")
278
- return self._fallback_analytics()
279
-
280
- try:
281
- prompt = self._create_analytics_prompt(processed_histories, all_topics)
282
- print("Successfully created prompt")
283
- print("Prompt preview:", prompt[:200] + "...") # Print first 200 chars
284
- except Exception as prompt_error:
285
- print(f"Error in prompt creation: {str(prompt_error)}")
286
- return self._fallback_analytics()
287
-
288
- # Generate initial analytics with evidence
289
- # prompt = self._create_analytics_prompt(chat_histories, all_topics)
290
- response = self.model.generate_content(
291
- prompt,
292
- generation_config=genai.GenerationConfig(
293
- response_mime_type="application/json",
294
- temperature=0.15
295
- )
296
- )
297
- print(response.text)
298
-
299
- if not response.text:
300
- print("Empty response from Gemini")
301
- return self._fallback_analytics()
302
-
303
- # Parse initial analytics
304
- # initial_analytics = self._process_gemini_response(response.text)
305
- initial_analytics2 = json.loads(response.text)
306
- print("Initial analytics:", initial_analytics2)
307
- # print("Initial analytics type:", type(initial_analytics2))
308
- # print("Moving to validation...")
309
-
310
- # Validate analytics using evidence
311
- validated_analytics = self._validate_analytics_with_evidence(initial_analytics2)
312
-
313
- # # Enrich with additional metrics
314
- final_analytics = self._enrich_analytics(validated_analytics)
315
-
316
- return final_analytics
317
-
318
- except Exception as e:
319
- print(f"Error generating analytics: {str(e)}")
320
- return self._fallback_analytics()
321
-
322
- def _fallback_analytics(self) -> Dict:
323
- """Provide fallback analytics with explanation."""
324
- return {
325
- "topic_insights": [],
326
- "student_insights": [],
327
- "recommended_actions": [
328
- {
329
- "action": "Review analytics generation process",
330
- "priority": "high",
331
- "target_group": "system_administrators",
332
- "reasoning": "Analytics generation failed",
333
- "expected_impact": "Restore analytics functionality",
334
- "evidence": {
335
- "error": "Analytics generation failed to complete"
336
- }
337
- }
338
- ],
339
- "course_health": {
340
- "overall_engagement": 0,
341
- "critical_topics": [],
342
- "class_distribution": {
343
- "high_performers": 0,
344
- "average_performers": 0,
345
- "at_risk": 0
346
- }
347
- },
348
- "intervention_metrics": {
349
- "immediate_attention_needed": [],
350
- "monitoring_required": []
351
- }
352
- }
353
- def _process_gemini_response(self, response: str) -> Dict:
354
- print("Entered here")
355
- try:
356
- analytics = json.loads(response, object_hook=json_serializer)
357
- if not isinstance(analytics, dict):
358
- raise ValueError("Invalid response format")
359
- return analytics
360
- except Exception as e:
361
- print(f"Error processing Gemini response: {str(e)}")
362
- return self._fallback_analytics()
363
-
364
- load_dotenv()
365
- MONGODB_URI = os.getenv("MONGO_URI")
366
- from file_upload_vectorize import model
367
- import streamlit as st
368
-
369
- def extract_topics_from_materials(session_id):
370
- """Extract topics from pre-class materials"""
371
- materials = resources_collection.find({"session_id": session_id})
372
- texts = ""
373
- if materials:
374
- for material in materials:
375
- if 'text_content' in material:
376
- text = material['text_content']
377
- texts += text + "\n"
378
- else:
379
- st.warning("No text content found in the material.")
380
- return
381
- else:
382
- st.error("No pre-class materials found for this session.")
383
- return
384
-
385
- if texts:
386
- context_prompt = f"""
387
- Task: Extract Comprehensive Topics in a List Format
388
- You are tasked with analyzing the provided text content and extracting a detailed, flat list of topics.
389
-
390
- Instructions:
391
- Identify All Topics: Extract a comprehensive list of all topics, subtopics, and indirect topics present in the provided text content. This list should include:
392
-
393
- Overarching themes
394
- Main topics
395
- Subtopics and their sub-subtopics
396
- Indirectly related topics
397
- Flat List Format: Provide a flat list where each item is a topic. Ensure topics at all levels (overarching, main, sub, sub-sub, indirect) are represented as individual entries in the list.
398
-
399
- Be Exhaustive: Ensure the response captures every topic, subtopic, and indirectly related concept comprehensively.
400
-
401
- Output Requirements:
402
- Use this structure:
403
- {{
404
- "topics": [
405
- "Topic 1",
406
- "Topic 2",
407
- "Topic 3",
408
- ...
409
- ]
410
- }}
411
- Do Not Include: Do not include backticks, hierarchical structures, or the word 'json' in your response.
412
-
413
- Content to Analyze:
414
- {texts}
415
- """
416
- try:
417
- # response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(response_mime_type="application/json", response_schema=list[Topics]))
418
- response = model.generate_content(context_prompt, generation_config=genai.GenerationConfig(temperature=0.3))
419
- if not response or not response.text:
420
- st.error("Error extracting topics from materials.")
421
- return
422
-
423
- topics = response.text
424
- return topics
425
- except Exception as e:
426
- st.error(f"Error extracting topics: {str(e)}")
427
- return None
428
- else:
429
- st.error("No text content found in the pre-class materials.")
430
- return None
431
-
432
-
433
- def get_chat_history(user_id, session_id):
434
- query = {
435
- "user_id": ObjectId(user_id),
436
- "session_id": session_id,
437
- "timestamp": {"$lte": datetime.utcnow()}
438
- }
439
- result = chat_history_collection.find(query)
440
- return list(result)
441
-
442
- def json_serializer(obj):
443
- if isinstance(obj, ObjectId):
444
- return str(obj)
445
- raise TypeError(f"Type {type(obj)} not serializable")
446
-
447
- if __name__ == "__main__":
448
- client = MongoClient(MONGODB_URI)
449
- db = client["novascholar_db"]
450
- chat_history_collection = db["chat_history"]
451
- resources_collection = db["resources"]
452
- session_id = "S104"
453
- # Connect to MongoDB
454
- user_ids = chat_history_collection.distinct("user_id", {"session_id": session_id})
455
- # Debug print 2: Check user_ids
456
- print("Found user_ids:", user_ids)
457
-
458
- all_chat_histories = []
459
- for user_id in user_ids:
460
- result = get_chat_history(user_id, session_id)
461
- # Debug print 3: Check each chat history result
462
- print(f"Chat history for user {user_id}:", "Found" if result else "Not found")
463
- if result:
464
- for record in result:
465
- chat_history = {
466
- "user_id": record["user_id"], # Convert ObjectId to string
467
- "session_id": record["session_id"],
468
- "messages": record["messages"]
469
- }
470
- all_chat_histories.append(chat_history)
471
-
472
- print(all_chat_histories)
473
-
474
- # Export all chat histories to a JSON file
475
- # Path: sample_files/chat_histories.json
476
- # with open("sample_files/all_chat_histories3.json", "w") as file:
477
- # json.dump(all_chat_histories, file, indent=2)
478
-
479
- # Debug print 4: Check chat histories
480
- print("Total chat histories collected:", len(all_chat_histories))
481
-
482
- # Extract topics with debug print
483
- # topics = extract_topics_from_materials(session_id)
484
- # # Export extracted topics to a JSON file
485
- # with open("sample_files/extracted_topics.json", "w") as file:
486
- # json.dump(topics, file, indent=2)
487
-
488
- # Load extracted topics from JSON file
489
- with open("sample_files/extracted_topics.json", "r") as file:
490
- topics = json.load(file)
491
- # Debug print 5: Check topics
492
- print("Extracted topics:", topics)
493
-
494
- # Generate analytics
495
-
496
- analytics_generator = NovaScholarAnalytics()
497
- analytics = analytics_generator.generate_analytics(all_chat_histories, topics)
498
- # Debug print 6: Check generated analytics
499
- print("Generated Analytics:", analytics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pre_class_analytics4.py DELETED
@@ -1,592 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from datetime import datetime
4
- from typing import List, Dict, Any, Tuple
5
- import spacy
6
- from collections import Counter, defaultdict
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from sklearn.metrics.pairwise import cosine_similarity
9
- from textblob import TextBlob
10
- import networkx as nx
11
- from scipy import stats
12
- import logging
13
- import json
14
- from dataclasses import dataclass
15
- from enum import Enum
16
-
17
- # Configure logging
18
- logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
-
21
- class TopicDifficulty(Enum):
22
- EASY = "easy"
23
- MODERATE = "moderate"
24
- DIFFICULT = "difficult"
25
- VERY_DIFFICULT = "very_difficult"
26
-
27
-
28
- @dataclass
29
- class QuestionMetrics:
30
- complexity_score: float
31
- follow_up_count: int
32
- clarification_count: int
33
- time_spent: float
34
- sentiment_score: float
35
-
36
- @dataclass
37
- class TopicInsights:
38
- difficulty_level: TopicDifficulty
39
- common_confusion_points: List[str]
40
- question_patterns: List[str]
41
- time_distribution: Dict[str, float]
42
- engagement_metrics: Dict[str, float]
43
- recommended_focus_areas: List[str]
44
-
45
- def to_dict(self):
46
- return {
47
- "difficulty_level": self.difficulty_level.value, # Convert enum to its value
48
- "common_confusion_points": self.common_confusion_points,
49
- "question_patterns": self.question_patterns,
50
- "time_distribution": {str(k): v for k, v in self.time_distribution.items()},
51
- "engagement_metrics": self.engagement_metrics,
52
- "recommended_focus_areas": self.recommended_focus_areas,
53
- }
54
-
55
- class PreClassAnalytics:
56
- def __init__(self, nlp_model: str = "en_core_web_lg"):
57
- """Initialize the analytics system with necessary components."""
58
- self.nlp = spacy.load(nlp_model)
59
- self.question_indicators = {
60
- "what", "why", "how", "when", "where", "which", "who",
61
- "whose", "whom", "can", "could", "would", "will", "explain"
62
- }
63
- self.confusion_indicators = {
64
- "confused", "don't understand", "unclear", "not clear",
65
- "stuck", "difficult", "hard", "help", "explain again"
66
- }
67
- self.follow_up_indicators = {
68
- "also", "another", "additionally", "furthermore", "moreover",
69
- "besides", "related", "similarly", "again"
70
- }
71
-
72
- def preprocess_chat_history(self, chat_history: List[Dict]) -> pd.DataFrame:
73
- """Convert chat history to DataFrame with enhanced features."""
74
- messages = []
75
- for chat in chat_history:
76
- user_id = chat['user_id']['$oid']
77
- for msg in chat['messages']:
78
- try:
79
- # Ensure the timestamp is in the correct format
80
- if isinstance(msg['timestamp'], dict) and '$date' in msg['timestamp']:
81
- timestamp = pd.to_datetime(msg['timestamp']['$date'])
82
- elif isinstance(msg['timestamp'], str):
83
- timestamp = pd.to_datetime(msg['timestamp'])
84
- else:
85
- raise ValueError("Invalid timestamp format")
86
- except Exception as e:
87
- print(f"Error parsing timestamp: {msg['timestamp']}, error: {e}")
88
- timestamp = pd.NaT # Use NaT (Not a Time) for invalid timestamps
89
-
90
- messages.append({
91
- 'user_id': user_id,
92
- 'timestamp': timestamp,
93
- 'prompt': msg['prompt'],
94
- 'response': msg['response'],
95
- 'is_question': any(q in msg['prompt'].lower() for q in self.question_indicators),
96
- 'shows_confusion': any(c in msg['prompt'].lower() for c in self.confusion_indicators),
97
- 'is_followup': any(f in msg['prompt'].lower() for f in self.follow_up_indicators)
98
- })
99
-
100
- df = pd.DataFrame(messages)
101
- df['sentiment'] = df['prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
102
- return df
103
-
104
- def extract_topic_hierarchies(self, df: pd.DataFrame) -> Dict[str, List[str]]:
105
- """Extract hierarchical topic relationships from conversations."""
106
- topic_hierarchy = defaultdict(list)
107
-
108
- for _, row in df.iterrows():
109
- doc = self.nlp(row['prompt'])
110
-
111
- # Extract main topics and subtopics using noun chunks and dependencies
112
- main_topics = []
113
- subtopics = []
114
-
115
- for chunk in doc.noun_chunks:
116
- if chunk.root.dep_ in ('nsubj', 'dobj'):
117
- main_topics.append(chunk.text.lower())
118
- else:
119
- subtopics.append(chunk.text.lower())
120
-
121
- # Build hierarchy
122
- for main_topic in main_topics:
123
- topic_hierarchy[main_topic].extend(subtopics)
124
-
125
- # Clean and deduplicate
126
- return {k: list(set(v)) for k, v in topic_hierarchy.items()}
127
-
128
- def analyze_topic_difficulty(self, df: pd.DataFrame, topic: str) -> TopicDifficulty:
129
- """Determine topic difficulty based on various metrics."""
130
- topic_msgs = df[df['prompt'].str.contains(topic, case=False)]
131
-
132
- # Calculate difficulty indicators
133
- confusion_rate = topic_msgs['shows_confusion'].mean()
134
- question_rate = topic_msgs['is_question'].mean()
135
- follow_up_rate = topic_msgs['is_followup'].mean()
136
- avg_sentiment = topic_msgs['sentiment'].mean()
137
-
138
- # Calculate composite difficulty score
139
- difficulty_score = (
140
- confusion_rate * 0.4 +
141
- question_rate * 0.3 +
142
- follow_up_rate * 0.2 +
143
- (1 - (avg_sentiment + 1) / 2) * 0.1
144
- )
145
-
146
- # Map score to difficulty level
147
- if difficulty_score < 0.3:
148
- return TopicDifficulty.EASY
149
- elif difficulty_score < 0.5:
150
- return TopicDifficulty.MODERATE
151
- elif difficulty_score < 0.7:
152
- return TopicDifficulty.DIFFICULT
153
- else:
154
- return TopicDifficulty.VERY_DIFFICULT
155
-
156
- def identify_confusion_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
157
- """Identify common patterns in student confusion."""
158
- confused_msgs = df[
159
- (df['prompt'].str.contains(topic, case=False)) &
160
- (df['shows_confusion'])
161
- ]['prompt']
162
-
163
- patterns = []
164
- for msg in confused_msgs:
165
- doc = self.nlp(msg)
166
-
167
- # Extract key phrases around confusion indicators
168
- for sent in doc.sents:
169
- for token in sent:
170
- if token.text.lower() in self.confusion_indicators:
171
- # Get context window around confusion indicator
172
- context = sent.text
173
- patterns.append(context)
174
-
175
- # Group similar patterns
176
- if patterns:
177
- vectorizer = TfidfVectorizer(ngram_range=(1, 3))
178
- tfidf_matrix = vectorizer.fit_transform(patterns)
179
- similarity_matrix = cosine_similarity(tfidf_matrix)
180
-
181
- # Cluster similar patterns
182
- G = nx.Graph()
183
- for i in range(len(patterns)):
184
- for j in range(i + 1, len(patterns)):
185
- if similarity_matrix[i][j] > 0.5: # Similarity threshold
186
- G.add_edge(i, j)
187
-
188
- # Extract representative patterns from each cluster
189
- clusters = list(nx.connected_components(G))
190
- return [patterns[min(cluster)] for cluster in clusters]
191
-
192
- return []
193
-
194
- def analyze_question_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
195
- """Analyze patterns in student questions about the topic."""
196
- topic_questions = df[
197
- (df['prompt'].str.contains(topic, case=False)) &
198
- (df['is_question'])
199
- ]['prompt']
200
-
201
- question_types = defaultdict(list)
202
- for question in topic_questions:
203
- doc = self.nlp(question)
204
-
205
- # Categorize questions
206
- if any(token.text.lower() in {"what", "define", "explain"} for token in doc):
207
- question_types["conceptual"].append(question)
208
- elif any(token.text.lower() in {"how", "steps", "process"} for token in doc):
209
- question_types["procedural"].append(question)
210
- elif any(token.text.lower() in {"why", "reason", "because"} for token in doc):
211
- question_types["reasoning"].append(question)
212
- else:
213
- question_types["other"].append(question)
214
-
215
- # Extract patterns from each category
216
- patterns = []
217
- for category, questions in question_types.items():
218
- if questions:
219
- vectorizer = TfidfVectorizer(ngram_range=(1, 3))
220
- tfidf_matrix = vectorizer.fit_transform(questions)
221
-
222
- # Get most representative questions
223
- feature_array = np.mean(tfidf_matrix.toarray(), axis=0)
224
- tfidf_sorting = np.argsort(feature_array)[::-1]
225
- features = vectorizer.get_feature_names_out()
226
-
227
- patterns.append(f"{category}: {' '.join(features[tfidf_sorting[:3]])}")
228
-
229
- return patterns
230
-
231
- def analyze_time_distribution(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
232
- """Analyze time spent on different aspects of the topic."""
233
- topic_msgs = df[df['prompt'].str.contains(topic, case=False)].copy()
234
- if len(topic_msgs) < 2:
235
- return {}
236
-
237
- topic_msgs['time_diff'] = topic_msgs['timestamp'].diff()
238
-
239
- # Calculate time distribution
240
- distribution = {
241
- 'total_time': topic_msgs['time_diff'].sum().total_seconds() / 60,
242
- 'avg_time_per_message': topic_msgs['time_diff'].mean().total_seconds() / 60,
243
- 'max_time_gap': topic_msgs['time_diff'].max().total_seconds() / 60,
244
- 'time_spent_on_questions': topic_msgs[topic_msgs['is_question']]['time_diff'].sum().total_seconds() / 60,
245
- 'time_spent_on_confusion': topic_msgs[topic_msgs['shows_confusion']]['time_diff'].sum().total_seconds() / 60
246
- }
247
-
248
- return distribution
249
-
250
- def calculate_engagement_metrics(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
251
- """Calculate student engagement metrics for the topic."""
252
- topic_msgs = df[df['prompt'].str.contains(topic, case=False)]
253
-
254
- metrics = {
255
- 'message_count': len(topic_msgs),
256
- 'question_ratio': topic_msgs['is_question'].mean(),
257
- 'confusion_ratio': topic_msgs['shows_confusion'].mean(),
258
- 'follow_up_ratio': topic_msgs['is_followup'].mean(),
259
- 'avg_sentiment': topic_msgs['sentiment'].mean(),
260
- 'engagement_score': 0.0 # Will be calculated below
261
- }
262
-
263
- # Calculate engagement score
264
- metrics['engagement_score'] = (
265
- metrics['message_count'] * 0.3 +
266
- metrics['question_ratio'] * 0.25 +
267
- metrics['follow_up_ratio'] * 0.25 +
268
- (metrics['avg_sentiment'] + 1) / 2 * 0.2 # Normalize sentiment to 0-1
269
- )
270
-
271
- return metrics
272
-
273
- def generate_topic_insights(self, df: pd.DataFrame, topic: str) -> TopicInsights:
274
- """Generate comprehensive insights for a topic."""
275
- difficulty = self.analyze_topic_difficulty(df, topic)
276
- confusion_points = self.identify_confusion_patterns(df, topic)
277
- question_patterns = self.analyze_question_patterns(df, topic)
278
- time_distribution = self.analyze_time_distribution(df, topic)
279
- engagement_metrics = self.calculate_engagement_metrics(df, topic)
280
-
281
- # Generate recommended focus areas based on insights
282
- focus_areas = []
283
-
284
- if difficulty in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT):
285
- focus_areas.append("Fundamental concept reinforcement needed")
286
-
287
- if confusion_points:
288
- focus_areas.append(f"Address common confusion around: {', '.join(confusion_points[:3])}")
289
-
290
- if engagement_metrics['confusion_ratio'] > 0.3:
291
- focus_areas.append("Consider alternative teaching approaches")
292
-
293
- if time_distribution.get('time_spent_on_questions', 0) > time_distribution.get('total_time', 0) * 0.5:
294
- focus_areas.append("More practical examples or demonstrations needed")
295
-
296
- return TopicInsights(
297
- difficulty_level=difficulty,
298
- common_confusion_points=confusion_points,
299
- question_patterns=question_patterns,
300
- time_distribution=time_distribution,
301
- engagement_metrics=engagement_metrics,
302
- recommended_focus_areas=focus_areas
303
- )
304
-
305
- def analyze_student_progress(self, df: pd.DataFrame) -> Dict[str, Any]:
306
- """Analyze individual student progress and learning patterns."""
307
- student_progress = {}
308
-
309
- for student_id in df['user_id'].unique():
310
- student_msgs = df[df['user_id'] == student_id]
311
-
312
- # Calculate student-specific metrics
313
- progress = {
314
- 'total_messages': len(student_msgs),
315
- 'questions_asked': student_msgs['is_question'].sum(),
316
- 'confusion_instances': student_msgs['shows_confusion'].sum(),
317
- 'avg_sentiment': student_msgs['sentiment'].mean(),
318
- 'topic_engagement': {},
319
- 'learning_pattern': self._identify_learning_pattern(student_msgs)
320
- }
321
-
322
- # Analyze topic-specific engagement
323
- topics = self.extract_topic_hierarchies(student_msgs)
324
- for topic in topics:
325
- topic_msgs = student_msgs[student_msgs['prompt'].str.contains(topic, case=False)]
326
- progress['topic_engagement'][topic] = {
327
- 'message_count': len(topic_msgs),
328
- 'confusion_rate': topic_msgs['shows_confusion'].mean(),
329
- 'sentiment_trend': stats.linregress(
330
- range(len(topic_msgs)),
331
- topic_msgs['sentiment']
332
- ).slope
333
- }
334
-
335
- student_progress[student_id] = progress
336
-
337
- return student_progress
338
-
339
- def _identify_learning_pattern(self, student_msgs: pd.DataFrame) -> str:
340
- """Identify student's learning pattern based on their interaction style."""
341
- # Calculate key metrics
342
- question_ratio = student_msgs['is_question'].mean()
343
- confusion_ratio = student_msgs['shows_confusion'].mean()
344
- follow_up_ratio = student_msgs['is_followup'].mean()
345
- sentiment_trend = stats.linregress(
346
- range(len(student_msgs)),
347
- student_msgs['sentiment']
348
- ).slope
349
-
350
- # Identify pattern
351
- if question_ratio > 0.6:
352
- return "Inquisitive Learner"
353
- elif confusion_ratio > 0.4:
354
- return "Needs Additional Support"
355
- elif follow_up_ratio > 0.5:
356
- return "Deep Dive Learner"
357
- elif sentiment_trend > 0:
358
- return "Progressive Learner"
359
- else:
360
- return "Steady Learner"
361
-
362
- def generate_comprehensive_report(self, chat_history: List[Dict]) -> Dict[str, Any]:
363
- """Generate a comprehensive analytics report."""
364
- # Preprocess chat history
365
- df = self.preprocess_chat_history(chat_history)
366
-
367
- # Extract topics
368
- topics = self.extract_topic_hierarchies(df)
369
-
370
- report = {
371
- 'topics': {},
372
- 'student_progress': self.analyze_student_progress(df),
373
- 'overall_metrics': {
374
- 'total_conversations': len(df),
375
- 'unique_students': df['user_id'].nunique(),
376
- 'avg_sentiment': df['sentiment'].mean(),
377
- 'most_discussed_topics': Counter(
378
- topic for topics_list in topics.values()
379
- for topic in topics_list
380
- ).most_common(5)
381
- }
382
- }
383
-
384
- # Generate topic-specific insights
385
- for main_topic, subtopics in topics.items():
386
- subtopic_insights = {}
387
- for subtopic in subtopics:
388
- subtopic_insights[subtopic] = {
389
- 'insights': self.generate_topic_insights(df, subtopic),
390
- 'related_topics': [t for t in subtopics if t != subtopic],
391
- 'student_engagement': {
392
- student_id: self.calculate_engagement_metrics(
393
- df[df['user_id'] == student_id],
394
- subtopic
395
- )
396
- for student_id in df['user_id'].unique()
397
- }
398
- }
399
-
400
- report['topics'][main_topic] = {
401
- 'insights': self.generate_topic_insights(df, main_topic),
402
- 'subtopics': subtopic_insights,
403
- 'topic_relationships': {
404
- 'hierarchy_depth': len(subtopics),
405
- 'connection_strength': self._calculate_topic_connections(df, main_topic, subtopics),
406
- 'progression_path': self._identify_topic_progression(df, main_topic, subtopics)
407
- }
408
- }
409
-
410
- # Add temporal analysis
411
- report['temporal_analysis'] = {
412
- 'daily_engagement': df.groupby(df['timestamp'].dt.date).agg({
413
- 'user_id': 'count',
414
- 'is_question': 'sum',
415
- 'shows_confusion': 'sum',
416
- 'sentiment': 'mean'
417
- }).to_dict(),
418
- 'peak_activity_hours': df.groupby(df['timestamp'].dt.hour)['user_id'].count().nlargest(3).to_dict(),
419
- 'learning_trends': self._analyze_learning_trends(df)
420
- }
421
-
422
- # Add recommendations
423
- report['recommendations'] = self._generate_recommendations(report)
424
-
425
- return report
426
-
427
- def _calculate_topic_connections(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> Dict[str, float]:
428
- """Calculate connection strength between topics based on co-occurrence."""
429
- connections = {}
430
- main_topic_msgs = df[df['prompt'].str.contains(main_topic, case=False)]
431
-
432
- for subtopic in subtopics:
433
- cooccurrence = df[
434
- df['prompt'].str.contains(main_topic, case=False) &
435
- df['prompt'].str.contains(subtopic, case=False)
436
- ].shape[0]
437
-
438
- connection_strength = cooccurrence / len(main_topic_msgs) if len(main_topic_msgs) > 0 else 0
439
- connections[subtopic] = connection_strength
440
-
441
- return connections
442
-
443
- def _identify_topic_progression(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> List[str]:
444
- """Identify optimal topic progression path based on student interactions."""
445
- topic_difficulties = {}
446
-
447
- for subtopic in subtopics:
448
- difficulty = self.analyze_topic_difficulty(df, subtopic)
449
- topic_difficulties[subtopic] = difficulty.value
450
-
451
- # Sort subtopics by difficulty
452
- return sorted(subtopics, key=lambda x: topic_difficulties[x])
453
-
454
- def _analyze_learning_trends(self, df: pd.DataFrame) -> Dict[str, Any]:
455
- """Analyze overall learning trends across the dataset."""
456
- return {
457
- 'sentiment_trend': stats.linregress(
458
- range(len(df)),
459
- df['sentiment']
460
- )._asdict(),
461
- 'confusion_trend': stats.linregress(
462
- range(len(df)),
463
- df['shows_confusion']
464
- )._asdict(),
465
- 'engagement_progression': self._calculate_engagement_progression(df)
466
- }
467
-
468
- def _calculate_engagement_progression(self, df: pd.DataFrame) -> Dict[str, float]:
469
- """Calculate how student engagement changes over time."""
470
- df['week'] = df['timestamp'].dt.isocalendar().week
471
- weekly_engagement = df.groupby('week').agg({
472
- 'is_question': 'mean',
473
- 'shows_confusion': 'mean',
474
- 'is_followup': 'mean',
475
- 'sentiment': 'mean'
476
- })
477
-
478
- return {
479
- 'question_trend': stats.linregress(
480
- range(len(weekly_engagement)),
481
- weekly_engagement['is_question']
482
- ).slope,
483
- 'confusion_trend': stats.linregress(
484
- range(len(weekly_engagement)),
485
- weekly_engagement['shows_confusion']
486
- ).slope,
487
- 'follow_up_trend': stats.linregress(
488
- range(len(weekly_engagement)),
489
- weekly_engagement['is_followup']
490
- ).slope,
491
- 'sentiment_trend': stats.linregress(
492
- range(len(weekly_engagement)),
493
- weekly_engagement['sentiment']
494
- ).slope
495
- }
496
-
497
- def _generate_recommendations(self, report: Dict[str, Any]) -> List[str]:
498
- """Generate actionable recommendations based on the analysis."""
499
- recommendations = []
500
-
501
- # Analyze difficulty distribution
502
- difficult_topics = [
503
- topic for topic, data in report['topics'].items()
504
- if data['insights'].difficulty_level in
505
- (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT)
506
- ]
507
-
508
- if difficult_topics:
509
- recommendations.append(
510
- f"Consider providing additional resources for challenging topics: {', '.join(difficult_topics)}"
511
- )
512
-
513
- # Analyze student engagement
514
- avg_engagement = np.mean([
515
- progress['questions_asked'] / progress['total_messages']
516
- for progress in report['student_progress'].values()
517
- ])
518
-
519
- if avg_engagement < 0.3:
520
- recommendations.append(
521
- "Implement more interactive elements to increase student engagement"
522
- )
523
-
524
- # Analyze temporal patterns
525
- peak_hours = list(report['temporal_analysis']['peak_activity_hours'].keys())
526
- recommendations.append(
527
- f"Consider scheduling additional support during peak activity hours: {peak_hours}"
528
- )
529
-
530
- # Analyze learning trends
531
- # sentiment_trend = report['temporal_analysis']['learning_trends']['sentiment_trend']
532
- # if sentiment_trend < 0:
533
- # recommendations.append(
534
- # "Review teaching approach to address declining student satisfaction"
535
- # )
536
- # Analyze learning trends
537
- # Analyze learning trends
538
- sentiment_trend = report.get('temporal_analysis', {}).get('learning_trends', {}).get('sentiment_trend', None)
539
- if isinstance(sentiment_trend, (int, float)):
540
- if sentiment_trend < 0:
541
- recommendations.append(
542
- "Review teaching approach to address declining student satisfaction"
543
- )
544
- elif isinstance(sentiment_trend, dict):
545
- # Handle the case where sentiment_trend is a dictionary
546
- print(f"Unexpected dict format for sentiment_trend: {sentiment_trend}")
547
- else:
548
- print(f"Unexpected type for sentiment_trend: {type(sentiment_trend)}")
549
-
550
- return recommendations
551
-
552
- class CustomJSONEncoder(json.JSONEncoder):
553
- def default(self, obj):
554
- if isinstance(obj, TopicDifficulty):
555
- return obj.value
556
- if isinstance(obj, TopicInsights):
557
- return obj.to_dict()
558
- if isinstance(obj, np.integer):
559
- return int(obj)
560
- if isinstance(obj, np.floating):
561
- return float(obj)
562
- if isinstance(obj, np.ndarray):
563
- return obj.tolist()
564
- if isinstance(obj, datetime):
565
- return obj.isoformat()
566
- return super().default(obj)
567
-
568
- def convert_insights_to_dict(report):
569
- for main_topic, data in report['topics'].items():
570
- if isinstance(data['insights'], TopicInsights):
571
- data['insights'] = data['insights'].to_dict()
572
- for subtopic, subdata in data['subtopics'].items():
573
- if isinstance(subdata['insights'], TopicInsights):
574
- subdata['insights'] = subdata['insights'].to_dict()
575
-
576
- if __name__ == "__main__":
577
- # Load chat history data
578
- chat_history = None
579
- with open('sample_files/chat_history_corpus.json', 'r', encoding="utf-8") as file:
580
- chat_history = json.load(file)
581
-
582
- # Initialize analytics system
583
- analytics = PreClassAnalytics()
584
-
585
- # Generate comprehensive report
586
- report = analytics.generate_comprehensive_report(chat_history)
587
-
588
- # Convert insights to dictionary
589
- # convert_insights_to_dict(report)
590
-
591
- print(json.dumps(report, indent=4, cls=CustomJSONEncoder))
592
- # print(report)
session_page.py CHANGED

@@ -34,6 +34,7 @@ from bs4 import BeautifulSoup
 import streamlit.components.v1 as components
 from live_chat_feature import display_live_chat_interface
 from code_playground import display_code_playground
+from urllib.parse import urlparse, parse_qs
 
 # Load environment variables
 load_dotenv()
@@ -550,36 +551,107 @@ def display_preclass_content(session, student_id, course_id):
 
 import requests
 
+def get_supported_url_formats():
+    """Return a list of supported URL formats for faculty reference"""
+    return """
+    Supported YouTube URL formats:
+    1. Standard watch URL: https://www.youtube.com/watch?v=VIDEO_ID
+    2. Short URL: https://youtu.be/VIDEO_ID
+    3. Embed URL: https://www.youtube.com/embed/VIDEO_ID
+    4. Mobile URL: https://m.youtube.com/watch?v=VIDEO_ID
+    5. YouTube Shorts: https://www.youtube.com/shorts/VIDEO_ID
+
+    You can copy any of these formats from:
+    - YouTube website (Share button)
+    - YouTube mobile app (Share button)
+    - Browser address bar while watching the video
+    """
+
+
+def display_url_guidance():
+    """Display guidance for faculty on how to get the correct URL"""
+    st.info("""
+    📝 How to get the correct YouTube URL:
+    1. Go to the YouTube video you want to share
+    2. Click the 'Share' button below the video
+    3. Copy the URL provided in the share dialog
+    4. Paste it here
+
+    The URL should start with either 'youtube.com' or 'youtu.be'
+    """)
 def fetch_youtube_video_title(video_url):
-    """Fetch the title of a YouTube video using the YouTube Data API"""
+    """
+    Fetch the title of a YouTube video with detailed error handling
+    """
     api_key = os.getenv("YOUTUBE_API_KEY")
+    if not api_key:
+        st.error("⚠️ System Configuration Error: YouTube API key not configured.")
+        st.write("Please contact technical support for assistance.")
+        return None
+
     video_id = extract_youtube_id(video_url)
     if not video_id:
        return None
 
     url = f"https://www.googleapis.com/youtube/v3/videos?id={video_id}&key={api_key}&part=snippet"
-    response = requests.get(url)
-    if response.status_code == 200:
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+
         data = response.json()
-        if "items" in data and len(data["items"]) > 0:
-            return data["items"][0]["snippet"]["title"]
-    return None
+        if not data.get("items"):
+            st.error("⚠️ Video not found or might be private.")
+            st.write("""
+            Please check if:
+            1. The video is publicly available
+            2. The URL is correct
+            3. The video hasn't been deleted
+            """)
+            return None
+
+        return data["items"][0]["snippet"]["title"]
+
+    except requests.exceptions.RequestException as e:
+        if "quotaExceeded" in str(e):
+            st.error("⚠️ YouTube API quota exceeded.")
+            st.write("""
+            The system has reached its daily limit for video processing.
+            Please try:
+            1. Waiting a few hours
+            2. Trying again tomorrow
+            3. Contact support if the issue persists
+            """)
+        else:
+            st.error(f"Error fetching video title: {str(e)}")
+            st.write("Please try again or choose a different video.")
+        return None
 
 def upload_video_source(course_id, session_id, video_url):
-    """Upload video source and its transcript to the database"""
+    """
+    Upload video source and its transcript with comprehensive error handling
+    """
+    if not video_url:
+        st.error("Please provide a YouTube URL.")
+        display_url_guidance()
+        return None
+
+    # Display processing message
+    # with st.spinner("Processing your YouTube video..."):
+    # Validate video URL
+    video_id = extract_youtube_id(video_url)
+    if not video_id:
+        return None
+
     # Fetch video title
     video_title = fetch_youtube_video_title(video_url)
     if not video_title:
-        st.error("Could not fetch the video title from the provided YouTube URL.")
-        return
-    # print("Video Title: ", video_title)
-    # Extract transcript from YouTube video
+        return None
+
+    # Extract transcript
     transcript = extract_youtube_transcript(video_url)
-
     if not transcript:
-        st.error("Could not extract transcript from the provided YouTube URL.")
-        return
-
+        return None
+
     # Create resource document
     resource_data = {
         "_id": ObjectId(),
@@ -590,37 +662,89 @@ def upload_video_source(course_id, session_id, video_url):
         "text_content": transcript,
         "material_type": "video",
         "source_url": video_url,
-        "uploaded_at": datetime.utcnow()
-    }
+        "uploaded_at": datetime.utcnow(),
+        "video_id": video_id
+    }
+
     # Check if resource already exists
     existing_resource = resources_collection.find_one({
         "session_id": session_id,
-        "source_url": video_url
+        "video_id": video_id
     })
 
     if existing_resource:
-        st.warning("This video resource already exists.")
+        st.warning("⚠️ This video has already been added to this session.")
+        st.write("""
+        Options:
+        1. Choose a different video
+        2. Use the existing video resource
+        3. Remove the existing video first if you want to re-add it
+        """)
         return existing_resource["_id"]
 
-    # Insert new resource
-    resources_collection.insert_one(resource_data)
-    resource_id = resource_data["_id"]
-
-    # Update course document
-    courses_collection.update_one(
-        {
-            "course_id": course_id,
-            "sessions.session_id": session_id
-        },
-        {
-            "$push": {"sessions.$.pre_class.resources": resource_id}
-        }
-    )
-    # Create vector store for the transcript
-    create_vector_store(transcript, resource_id)
-
-    # st.success("Video source uploaded successfully!")
-    return resource_id
+    try:
+        # Insert new resource
+        result = resources_collection.insert_one(resource_data)
+        resource_id = result.inserted_id
+
+        # Update course document
+        update_result = courses_collection.update_one(
+            {
+                "course_id": course_id,
+                "sessions.session_id": session_id
+            },
+            {
+                "$push": {"sessions.$.pre_class.resources": resource_id}
+            }
+        )
+
+        if update_result.modified_count == 0:
+            st.error("⚠️ Failed to update course with new resource.")
+            st.write("""
+            The video was processed but couldn't be added to the course.
+            This might be because:
+            1. The course or session ID is invalid
+            2. You don't have permission to modify this course
+            3. There was a system error
+
+            Please try again or contact support if the issue persists.
+            """)
+            # Rollback resource insertion
+            resources_collection.delete_one({"_id": resource_id})
+            return None
+
+        # Create vector store for the transcript
+        # create_vector_store(transcript, resource_id)
+        vector_store_result = create_vector_store(transcript, resource_id)
+        if not vector_store_result:
+            st.error("⚠️ Failed to create vector store for the transcript.")
+            # Rollback insertions
+            resources_collection.delete_one({"_id": resource_id})
+            return None
+
+        st.success("✅ Video successfully added to your course!")
+        st.write(f"""
+        Added: "{video_title}"
+        You can now:
+        1. Add more videos
+        2. Preview the added video
+        3. Continue building your course
+        """)
+        return resource_id
+
+    except Exception as e:
+        st.error("⚠️ Error uploading video source.")
+        st.write(f"""
+        There was an error while saving the video:
+        {str(e)}
+
+        Please:
+        1. Try again
+        2. Choose a different video
+        3. Contact support if the issue persists
+        """)
+        return None
 
 def upload_preclass_materials(session_id, course_id):
     """Upload pre-class materials and manage external resources for a session"""
@@ -648,8 +772,8 @@ def upload_preclass_materials(session_id, course_id):
     if st.button("Upload Video"):
         with st.spinner("Processing video source..."):
             video_resource_id = upload_video_source(course_id, session_id, video_url)
-            if video_resource_id:
-                st.success("Video source uploaded successfully!")
+            # if video_resource_id:
+            #     st.success("Video source uploaded successfully!")
 
     with external_tab:
         # Fetch and display external resources
@@ -717,7 +841,10 @@ def upload_preclass_materials(session_id, course_id):
     for material_type, resources in grouped_resources.items():
         st.markdown(f"##### {material_type.capitalize()} Resources")
         for material in resources:
-            st.markdown(f"- **{material['file_name']}** ({material['file_type']})")
+            resource_info = f"- **{material['file_name']}** ({material['file_type']})"
+            if 'source_url' in material:
+                resource_info += f" - [URL]({material['source_url']})"
+            st.markdown(resource_info)
 
 def extract_external_content(url, content_type):
     """Extract content from external resources based on their type"""
@@ -731,18 +858,57 @@ def extract_external_content(url, content_type):
     return None
 
 def extract_youtube_transcript(url):
-    """Extract transcript from YouTube videos"""
+    """
+    Extract transcript from YouTube videos with detailed error handling
+    """
     try:
-        # Extract video ID from URL
-        video_id = url.split('v=')[1].split('&')[0]
-
-        # Get transcript
-        transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        # Combine transcript text
-        full_text = ' '.join([entry['text'] for entry in transcript])
-        return full_text
+        video_id = extract_youtube_id(url)
+        if not video_id:
+            return None
+
+        # Get transcript with retries
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                transcript = YouTubeTranscriptApi.get_transcript(video_id)
+                # Combine transcript text with proper spacing and punctuation
+                full_text = ''
+                for entry in transcript:
+                    text = entry['text'].strip()
+                    if text:
+                        if not full_text.endswith(('.', '!', '?', '..."')):
+                            full_text += '. '
+                        full_text += text + ' '
+                return full_text.strip()
+            except Exception as e:
+                if attempt == max_retries - 1:
+                    raise e
+                continue
+
     except Exception as e:
-        st.error(f"Could not extract YouTube transcript: {str(e)}")
+        error_message = str(e)
+        if "Video unavailable" in error_message:
+            st.error("⚠️ This video is unavailable or private. Please check if:")
+            st.write("""
+            - The video is set to public or unlisted
+            - The video hasn't been deleted
+            - You have the correct URL
+            """)
+        elif "Subtitles are disabled" in error_message:
+            st.error("⚠️ This video doesn't have subtitles/transcript available.")
+            st.write("""
+            Unfortunately, this video cannot be used because:
+            - It doesn't have closed captions or subtitles
+            - The creator hasn't enabled transcript generation
+
+            Please choose another video that has subtitles available.
+            You can check if a video has subtitles by:
+            1. Playing the video on YouTube
+            2. Clicking the 'CC' button in the video player
+            """)
+        else:
+            st.error(f"Could not extract YouTube transcript: {error_message}")
+            st.write("Please try again or choose a different video.")
         return None
 
 def extract_web_article(url):
@@ -813,17 +979,54 @@ def upload_external_resource(course_id, session_id, title, content, content_type
     return resource_id
 
 def extract_youtube_id(url):
-    """Extract YouTube video ID from URL"""
-    if 'youtube.com' in url:
-        try:
-            return url.split('v=')[1].split('&')[0]
-        except IndexError:
-            return None
-    elif 'youtu.be' in url:
-        try:
-            return url.split('/')[-1]
-        except IndexError:
-            return None
+    """
+    Extract YouTube video ID from various URL formats
+    """
+    if not url:
+        st.error("Please provide a YouTube URL.")
+        display_url_guidance()
+        return None
+
+    # Clean the URL
+    url = url.strip()
+
+    # Basic URL validation
+    if not ('youtube.com' in url or 'youtu.be' in url):
+        st.error("This doesn't appear to be a YouTube URL.")
+        st.write(get_supported_url_formats())
+        return None
+
+    # Try to extract using regex patterns
+    patterns = [
+        r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/e\/|youtube\.com\/shorts\/)([^&\n?#]+)',
+        r'(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})'
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            video_id = match.group(1)
+            if len(video_id) != 11:  # YouTube IDs are always 11 characters
+                st.error("Invalid YouTube video ID length. Please check your URL.")
+                display_url_guidance()
+                return None
+            return video_id
+
+    # If regex fails, try parsing URL components
+    try:
+        parsed_url = urlparse(url)
+        if 'youtube.com' in parsed_url.netloc:
+            query_params = parse_qs(parsed_url.query)
+            if 'v' in query_params:
+                return query_params['v'][0]
+        elif 'youtu.be' in parsed_url.netloc:
+            return parsed_url.path.lstrip('/')
+    except Exception:
+        pass
+
+    # If all extraction methods fail
+    st.error("Could not extract video ID from the provided URL.")
+    st.write(get_supported_url_formats())
     return None
 
 def display_live_presentation(session, user_type, course_id):
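Note on the URL handling change: the reworked extract_youtube_id tries regex patterns covering watch, short, embed, mobile, and Shorts links, then falls back to urlparse/parse_qs, instead of splitting the string on 'v='. The sketch below exercises the same extraction idea outside Streamlit; the helper name, the simplified single pattern, and the sample URLs are illustrative only, not code from this repository.

import re
from typing import Optional
from urllib.parse import urlparse, parse_qs

def extract_video_id(url: str) -> Optional[str]:
    """Pull the 11-character YouTube video ID out of common share-link formats."""
    url = url.strip()
    # One pattern covering watch, embed, Shorts, /v/ and youtu.be links.
    match = re.search(
        r'(?:youtube\.com/(?:watch\?v=|embed/|shorts/|v/)|youtu\.be/)([A-Za-z0-9_-]{11})',
        url,
    )
    if match:
        return match.group(1)
    # Fallback: parse the query string of a youtube.com URL.
    parsed = urlparse(url)
    if 'youtube.com' in parsed.netloc:
        return parse_qs(parsed.query).get('v', [None])[0]
    if 'youtu.be' in parsed.netloc:
        return parsed.path.lstrip('/') or None
    return None

if __name__ == "__main__":
    for sample in (
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://youtu.be/dQw4w9WgXcQ",
        "https://www.youtube.com/shorts/dQw4w9WgXcQ",
        "https://m.youtube.com/watch?v=dQw4w9WgXcQ",
    ):
        print(sample, "->", extract_video_id(sample))

Every variant resolves to the same 11-character ID, which is what lets the reworked upload_video_source de-duplicate resources on video_id rather than on the raw URL string.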