"""Quiz Generator: segments a document with Gemini and builds a quiz per segment.

Pipeline: clean the text, split it into token-bounded parts, ask Gemini to
segment each part and generate quiz questions, then format and save the
results. A Gradio UI wires the pieces together.
"""

import json
import os
import re
import time

import gradio as gr
from langchain_google_genai import ChatGoogleGenerativeAI
from transformers import AutoTokenizer

# The tokenizer is used only for counting tokens when chunking the document.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")


def clean_text(text):
    """Strip speaker tags (e.g. [speaker_1]) and collapse whitespace."""
    text = re.sub(r'\[speaker_\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def split_text_by_tokens(text, max_tokens=8000):
    """Split the text into two halves of roughly equal token count.

    Sentences are kept intact; the split point is the midpoint of the total
    token count. Note that a document longer than ~2x max_tokens can still
    yield halves above the limit.
    """
    text = clean_text(text)
    # add_special_tokens=False keeps the count to actual content tokens.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return [text]

    split_point = len(tokens) // 2
    sentences = re.split(r'(?<=[.!?])\s+', text)
    first_half = []
    second_half = []
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence, add_special_tokens=False))
        # Once the second half has started, keep appending to it so the
        # original sentence order is preserved.
        if not second_half and current_tokens + sentence_tokens <= split_point:
            first_half.append(sentence)
            current_tokens += sentence_tokens
        else:
            second_half.append(sentence)
    return [" ".join(first_half), " ".join(second_half)]


def analyze_segment_with_gemini(segment_text):
    """Ask Gemini to segment the text and generate quiz questions as JSON."""
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.7,
        max_tokens=None,
        timeout=None,
        max_retries=3
    )
    prompt = f"""
    Analyze the following text, perform text segmentation, and identify the distinct segments within it:
    1. Identify at most 15 segments (STRICT limit).
    2. For each segment/topic you identify:
       - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
       - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
       - Write a brief summary of that segment (3-5 sentences)
       - Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content of that segment only
       - Questions and answers must come only from the content of the segment

    For each quiz question:
    - Create one correct answer that comes DIRECTLY from the text
    - Create two plausible but incorrect answers
    - IMPORTANT: Ensure all answer options have similar length (± 3 words)
    - Mark exactly one option per question with "correct": true in the JSON output
    - Questions should **require actual understanding**, not just basic fact recall
    - Questions must be **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**
    - Questions must be **directly based on the segment's content** (not inferred from the summary)
    - Do **not include questions about document structure** (e.g., title, number of paragraphs)
    - Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?")
    - Focus on **core ideas, logical reasoning, and conceptual understanding**

    ADDITIONAL REQUIREMENT:
    - **First, detect the language of the original text.**
    - **Generate ALL output (topic names, key concepts, summaries, and quizzes) in the same language as the original text.**
    - If the text is in Russian, generate all responses in Russian.
    - If the text is in another language, generate responses in that original language.

    Text:
    {segment_text}

    Format your response as JSON with the following structure:
    {{
      "segments": [
        {{
          "topic_name": "Unique and Specific Topic Name",
          "key_concepts": ["concept1", "concept2", "concept3"],
          "summary": "Brief summary of this segment.",
          "quiz_questions": [
            {{
              "question": "Question text?",
              "options": [
                {{"text": "Option A", "correct": false}},
                {{"text": "Option B", "correct": true}},
                {{"text": "Option C", "correct": false}}
              ]
            }}
          ]
        }}
      ]
    }}

    IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
    - **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
    - **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
    """
    response = llm.invoke(prompt)
    response_text = response.content
    try:
        # The model may wrap the JSON in prose or a code fence; extract the
        # outermost braces before parsing.
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            return json.loads(json_match.group(0))
        return json.loads(response_text)
    except json.JSONDecodeError:
        # Fall back to a single error segment so downstream code keeps working.
        return {
            "segments": [
                {
                    "topic_name": "JSON Parsing Error",
                    "key_concepts": ["Error in response format"],
                    "summary": "Could not parse the API response.",
                    "quiz_questions": []
                }
            ]
        }
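# A minimal sketch of calling the analyzer directly, outside the UI. It
# assumes GOOGLE_API_KEY is already set; `sample_text` is an illustrative
# placeholder, not something defined in this script.
#
#   os.environ["GOOGLE_API_KEY"] = "<your key>"
#   sample_text = "Photosynthesis converts light into chemical energy. ..."
#   result = analyze_segment_with_gemini(sample_text)
#   for seg in result["segments"]:
#       print(seg["topic_name"], seg["key_concepts"])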
def process_document_with_quiz(text):
    """Chunk the document if needed, analyze each part, and number the segments."""
    start_time = time.time()
    token_count = len(tokenizer.encode(text, add_special_tokens=False))
    print(f"[LOG] Total document tokens: {token_count}")

    if token_count > 8000:
        print("[LOG] Document exceeds 8000 tokens. Splitting into parts.")
        parts = split_text_by_tokens(text)
        print(f"[LOG] Document split into {len(parts)} parts")
        for i, part in enumerate(parts):
            part_tokens = len(tokenizer.encode(part, add_special_tokens=False))
            print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
    else:
        print("[LOG] Document within the 8000-token limit. Processing as a single part.")
        parts = [text]

    all_segments = []
    segment_counter = 1
    for i, part in enumerate(parts):
        part_start_time = time.time()
        print(f"[LOG] Processing part {i+1}...")
        analysis = analyze_segment_with_gemini(part)
        if "segments" in analysis:
            print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
            for segment in analysis["segments"]:
                segment["segment_number"] = segment_counter
                all_segments.append(segment)
                print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
                segment_counter += 1
        else:
            # Fallback if the response format is unexpected.
            print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
            fallback_segment = {
                "topic_name": f"Segment {segment_counter} Analysis",
                "key_concepts": ["Format error in analysis"],
                "summary": "Could not properly segment this part of the text.",
                "quiz_questions": [],
                "segment_number": segment_counter
            }
            all_segments.append(fallback_segment)
            print(f"[LOG] Added fallback segment {segment_counter}")
            segment_counter += 1
        part_time = time.time() - part_start_time
        print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")

    total_time = time.time() - start_time
    print(f"[LOG] Total processing time: {total_time:.2f} seconds")
    print(f"[LOG] Generated {len(all_segments)} segments total")
    return all_segments


def format_quiz_for_display(results):
    """Render the segment list as plain text, marking correct options with ✓."""
    output = []
    for segment in results:
        topic = segment["topic_name"]
        segment_num = segment["segment_number"]
        output.append(f"\n\n{'='*40}")
        output.append(f"SEGMENT {segment_num}: {topic}")
        output.append(f"{'='*40}\n")
        output.append("KEY CONCEPTS:")
        for concept in segment["key_concepts"]:
            output.append(f"• {concept}")
        output.append("\nSUMMARY:")
        output.append(segment["summary"])
        output.append("\nQUIZ QUESTIONS:")
        for i, q in enumerate(segment["quiz_questions"]):
            output.append(f"\n{i+1}. {q['question']}")
            for j, option in enumerate(q['options']):
                letter = chr(65 + j)  # A, B, C, ...
                correct_marker = " ✓" if option["correct"] else ""
                output.append(f"   {letter}. {option['text']}{correct_marker}")
    return "\n".join(output)
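# A quick end-to-end sketch without the UI (assumes GOOGLE_API_KEY is set;
# `document_text` is a placeholder for your own string):
#
#   segments = process_document_with_quiz(document_text)
#   print(format_quiz_for_display(segments))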
def save_results_as_json(results, filename="analysis_results.json"):
    """Write the raw segment data to disk and return the file path."""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    return filename


def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
    """Write the formatted quiz text to disk and return the file path."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(formatted_text)
    return filename


def analyze_document(document_text, api_key):
    """Gradio callback: run the pipeline and return (text, json_path, txt_path)."""
    print("[LOG] Starting document analysis...")
    overall_start_time = time.time()
    if not api_key:
        return "Please provide a Gemini API key.", None, None
    os.environ["GOOGLE_API_KEY"] = api_key
    try:
        results = process_document_with_quiz(document_text)
        formatted_output = format_quiz_for_display(results)
        json_path = save_results_as_json(results)
        txt_path = save_results_as_txt(formatted_output)

        overall_time = time.time() - overall_start_time
        print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")

        topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
        topics_summary += f"Total segments: {len(results)}\n"
        topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
        topics_summary += "SEGMENTS:\n"
        for segment in results:
            topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"

        formatted_output = topics_summary + "\n" + formatted_output
        return formatted_output, json_path, txt_path
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        print(f"[LOG] ERROR: {error_msg}")
        return error_msg, None, None


with gr.Blocks(title="Quiz Generator") as app:
    gr.Markdown("# Quiz Generator")
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Document Text",
                placeholder="Paste your document text here...",
                lines=10
            )
            api_key = gr.Textbox(
                label="Gemini API Key",
                placeholder="Enter your Gemini API key",
                type="password"
            )
            analyze_btn = gr.Button("Analyze Document")
        with gr.Column():
            output_results = gr.Textbox(
                label="Analysis Results",
                lines=20
            )
            json_file_output = gr.File(label="Download JSON")
            txt_file_output = gr.File(label="Download TXT")
    analyze_btn.click(
        fn=analyze_document,
        inputs=[input_text, api_key],
        outputs=[output_results, json_file_output, txt_file_output]
    )

if __name__ == "__main__":
    app.launch()
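# To try the app locally (assuming this file is saved as app.py):
#
#   pip install gradio transformers langchain-google-genai
#   python app.py
#
# Gradio prints a local URL (http://127.0.0.1:7860 by default) to open in a browser.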