import os
import re
import json
import time
import gradio as gr
import tempfile
from typing import Dict, Any, List, Optional
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from pydantic import BaseModel, Field
from anthropic import Anthropic
from huggingface_hub import login

# Model and embedding configuration
CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
OPENAI_MODEL = "gpt-4o"
GEMINI_MODEL = "gemini-2.0-flash"
DEFAULT_TEMPERATURE = 0.7
TOKENIZER_MODEL = "answerdotai/ModernBERT-base"
SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"


class CourseInfo(BaseModel):
    course_name: str = Field(description="Name of the course")
    section_name: str = Field(description="Name of the course section")
    lesson_name: str = Field(description="Name of the lesson")


class QuizOption(BaseModel):
    text: str = Field(description="The text of the answer option")
    correct: bool = Field(description="Whether this option is correct")


class QuizQuestion(BaseModel):
    question: str = Field(description="The text of the quiz question")
    options: List[QuizOption] = Field(description="List of answer options")


class Segment(BaseModel):
    segment_number: int = Field(description="The segment number")
    topic_name: str = Field(description="Unique and specific topic name that clearly differentiates it from other segments")
    key_concepts: List[str] = Field(description="3-5 key concepts discussed in the segment")
    summary: str = Field(description="Brief summary of the segment (3-5 sentences)")
    quiz_questions: List[QuizQuestion] = Field(description="5 quiz questions based on the segment content")


class TextSegmentAnalysis(BaseModel):
    course_info: CourseInfo = Field(description="Information about the course")
    segments: List[Segment] = Field(description="List of text segments with analysis")


# Authenticate with the Hugging Face Hub only when a token is provided
hf_token = os.environ.get('HF_TOKEN', None)
if hf_token:
    login(token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)
sentence_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

# System prompt used for the non-Claude (LangChain) providers
system_prompt = """You are an expert educational content analyzer. Your task is to analyze text content, identify distinct segments, and create high-quality educational quiz questions for each segment."""


def clean_text(text):
    """Remove speaker tags and collapse whitespace."""
    text = re.sub(r'\[speaker_\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def split_text_by_tokens(text, max_tokens=8000):
    """Split the text into two halves at sentence boundaries if it exceeds max_tokens."""
    text = clean_text(text)
    tokens = tokenizer.encode(text)

    if len(tokens) <= max_tokens:
        return [text]

    split_point = len(tokens) // 2
    sentences = re.split(r'(?<=[.!?])\s+', text)

    first_half = []
    second_half = []
    current_tokens = 0

    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence))
        if current_tokens + sentence_tokens <= split_point:
            first_half.append(sentence)
            current_tokens += sentence_tokens
        else:
            second_half.append(sentence)

    return [" ".join(first_half), " ".join(second_half)]


def generate_with_claude(text, api_key, course_name="", section_name="", lesson_name=""):
    """Analyze text with the Anthropic API, forcing structured output via a tool call."""
    from prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_CLAUDE

    client = Anthropic(api_key=api_key)

    segment_analysis_schema = TextSegmentAnalysis.model_json_schema()
    tools = [
        {
            "name": "build_segment_analysis",
            "description": "Build the text segment analysis with quiz questions",
            "input_schema": segment_analysis_schema
        }
    ]

    claude_system_prompt = """You are a helpful assistant specialized in text analysis and educational content creation.
You analyze texts to identify distinct segments, create summaries, and generate quiz questions."""

    prompt = ANALYSIS_PROMPT_TEMPLATE_CLAUDE.format(
        course_name=course_name,
        section_name=section_name,
        lesson_name=lesson_name,
        text=text
    )

    try:
        response = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=8192,
            temperature=DEFAULT_TEMPERATURE,
            system=claude_system_prompt,
            messages=[
                {"role": "user", "content": prompt}
            ],
            tools=tools,
            tool_choice={"type": "tool", "name": "build_segment_analysis"}
        )

        # Extract the tool call content
        if response.content and len(response.content) > 0 and hasattr(response.content[0], 'input'):
            function_call = response.content[0].input
            return function_call
        else:
            raise Exception("No valid tool call found in the response")
    except Exception as e:
        raise Exception(f"Error calling Anthropic API: {str(e)}")


def get_llm_by_api_key(api_key):
    """Return a LangChain chat model chosen by the API key format."""
    if api_key.startswith("sk-ant-"):
        # Claude API key format
        from langchain_anthropic import ChatAnthropic
        return ChatAnthropic(
            anthropic_api_key=api_key,
            model_name=CLAUDE_MODEL,
            temperature=DEFAULT_TEMPERATURE,
            max_retries=3
        )
    elif api_key.startswith("sk-"):
        # OpenAI API key format
        from langchain_openai import ChatOpenAI
        return ChatOpenAI(
            openai_api_key=api_key,
            model_name=OPENAI_MODEL,
            temperature=DEFAULT_TEMPERATURE,
            max_retries=3
        )
    else:
        # Default to Gemini
        from langchain_google_genai import ChatGoogleGenerativeAI
        os.environ["GOOGLE_API_KEY"] = api_key
        return ChatGoogleGenerativeAI(
            model=GEMINI_MODEL,
            temperature=DEFAULT_TEMPERATURE,
            max_retries=3
        )


def segment_and_analyze_text(text: str, api_key: str, course_name="", section_name="", lesson_name="") -> Dict[str, Any]:
    """Route the analysis to the appropriate provider and return the parsed result."""
    from prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_GEMINI

    if api_key.startswith("sk-ant-"):
        return generate_with_claude(text, api_key, course_name, section_name, lesson_name)

    # For other models, use LangChain
    llm = get_llm_by_api_key(api_key)

    prompt = ANALYSIS_PROMPT_TEMPLATE_GEMINI.format(
        course_name=course_name,
        section_name=section_name,
        lesson_name=lesson_name,
        text=text
    )

    try:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]
        response = llm.invoke(messages)

        try:
            content = response.content
            # Prefer JSON inside a fenced block, then any JSON object, then the raw content
            json_match = re.search(r'```json\s*([\s\S]*?)\s*```', content)
            if json_match:
                json_str = json_match.group(1)
            else:
                json_match = re.search(r'(\{[\s\S]*\})', content)
                if json_match:
                    json_str = json_match.group(1)
                else:
                    json_str = content

            # Parse the JSON
            function_call = json.loads(json_str)
            return function_call
        except json.JSONDecodeError:
            raise Exception("Could not parse JSON from LLM response")
    except Exception as e:
        raise Exception(f"Error calling API: {str(e)}")


def format_quiz_for_display(results):
    """Render the analysis results as human-readable text."""
    output = []

    if "course_info" in results:
        course_info = results["course_info"]
        output.append(f"{'='*40}")
        output.append(f"COURSE: {course_info.get('course_name', 'N/A')}")
        output.append(f"SECTION: {course_info.get('section_name', 'N/A')}")
        output.append(f"LESSON: {course_info.get('lesson_name', 'N/A')}")
        output.append(f"{'='*40}\n")

    segments = results.get("segments", [])
    for i, segment in enumerate(segments):
        topic = segment["topic_name"]
        segment_num = i + 1

        output.append(f"\n\n{'='*40}")
        output.append(f"SEGMENT {segment_num}: {topic}")
        output.append(f"{'='*40}\n")

        output.append("KEY CONCEPTS:")
        for concept in segment["key_concepts"]:
            output.append(f"• {concept}")

        output.append("\nSUMMARY:")
        output.append(segment["summary"])

        output.append("\nQUIZ QUESTIONS:")
        for q_idx, q in enumerate(segment["quiz_questions"]):
output.append(f"\n{i+1}. {q['question']}") for j, option in enumerate(q['options']): letter = chr(97 + j).upper() correct_marker = " ✓" if option["correct"] else "" output.append(f" {letter}. {option['text']}{correct_marker}") return "\n".join(output) def analyze_document(text, api_key, course_name, section_name, lesson_name): try: start_time = time.time() # Split text if it's too long text_parts = split_text_by_tokens(text) all_results = { "course_info": { "course_name": course_name, "section_name": section_name, "lesson_name": lesson_name }, "segments": [] } segment_counter = 1 # Process each part of the text for part in text_parts: analysis = segment_and_analyze_text( part, api_key, course_name=course_name, section_name=section_name, lesson_name=lesson_name ) if "segments" in analysis: for segment in analysis["segments"]: segment["segment_number"] = segment_counter all_results["segments"].append(segment) segment_counter += 1 end_time = time.time() total_time = end_time - start_time # Format the results for display formatted_text = format_quiz_for_display(all_results) formatted_text = f"Total processing time: {total_time:.2f} seconds\n\n" + formatted_text # Create temporary files for JSON and text output json_path = tempfile.mktemp(suffix='.json') with open(json_path, 'w', encoding='utf-8') as json_file: json.dump(all_results, json_file, indent=2) txt_path = tempfile.mktemp(suffix='.txt') with open(txt_path, 'w', encoding='utf-8') as txt_file: txt_file.write(formatted_text) return formatted_text, json_path, txt_path except Exception as e: error_message = f"Error processing document: {str(e)}" return error_message, None, None with gr.Blocks(title="Quiz Generator") as app: gr.Markdown("# Quiz Generator") with gr.Row(): with gr.Column(): course_name = gr.Textbox( placeholder="Enter the course name", label="Course Name" ) section_name = gr.Textbox( placeholder="Enter the section name", label="Section Name" ) lesson_name = gr.Textbox( placeholder="Enter the lesson name", label="Lesson Name" ) with gr.Row(): with gr.Column(): input_text = gr.Textbox( label="Input Document Text", placeholder="Paste your document text here...", lines=10 ) api_key = gr.Textbox( label="API Key", placeholder="Enter your OpenAI, Claude, or Gemini API key", type="password" ) analyze_btn = gr.Button("Analyze Document") with gr.Column(): output_results = gr.Textbox( label="Analysis Results", lines=20 ) json_file_output = gr.File(label="Download JSON") txt_file_output = gr.File(label="Download TXT") analyze_btn.click( fn=analyze_document, inputs=[input_text, api_key, course_name, section_name, lesson_name], outputs=[output_results, json_file_output, txt_file_output] ) if __name__ == "__main__": app.launch()