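# Quiz Generator: a Gradio app that splits an input document by token count,
# sends each part to Gemini (via langchain_google_genai) for topic segmentation,
# and produces key concepts, summaries, and multiple-choice quiz questions per
# segment, downloadable as JSON and TXT.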
import json
import os
import re
import time

import gradio as gr
import numpy as np
from langchain_google_genai import ChatGoogleGenerativeAI
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
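# Note: the ModernBERT tokenizer is used here only to count tokens when deciding
# whether to split the input (its ~8k-token context presumably motivates the
# 8000-token threshold below). The SentenceTransformer model is loaded but not
# referenced elsewhere in this file.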
def clean_text(text):
    """Strip [speaker_N] tags and collapse whitespace."""
    text = re.sub(r'\[speaker_\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def split_text_by_tokens(text, max_tokens=8000):
    """Return the text as a single chunk, or split it into two halves on
    sentence boundaries when it exceeds max_tokens."""
    text = clean_text(text)
    tokens = tokenizer.encode(text)

    if len(tokens) <= max_tokens:
        return [text]

    # Aim for roughly half of the tokens in each part, splitting at sentence ends.
    split_point = len(tokens) // 2
    sentences = re.split(r'(?<=[.!?])\s+', text)

    first_half = []
    second_half = []
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence))
        # Once the first half is full, keep all remaining sentences in order.
        if not second_half and current_tokens + sentence_tokens <= split_point:
            first_half.append(sentence)
            current_tokens += sentence_tokens
        else:
            second_half.append(sentence)

    return [" ".join(first_half), " ".join(second_half)]
def analyze_segment_with_gemini(segment_text):
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.7,
        max_tokens=None,
        timeout=None,
        max_retries=3
    )
prompt = f""" | |
Analyze the following text and identify distinct segments within it and do text segmentation: | |
1. Segments should be STRICTLY max=15 | |
2. For each segment/topic you identify: | |
- Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments | |
- List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments) | |
- Write a brief summary of that segment (3-5 sentences) | |
- Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content in that segment only | |
- Questions and answers should be only from the content of the segment | |
For each quiz question: | |
- Create one correct answer that comes DIRECTLY from the text | |
- Create two plausible but incorrect answers | |
- IMPORTANT: Ensure all answer options have similar length (± 3 words) | |
- Ensure the correct answer is clearly indicated with a ✓ symbol | |
- Questions should **require actual understanding**, NOT just basic fact recall. | |
- Questions Are **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**. | |
- Are **directly based on the segment's content** (not inferred from the summary). | |
- Do **not include questions about document structure** (e.g., title, number of paragraphs). | |
- Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?"). | |
- Focus on **core ideas, logical reasoning, and conceptual understanding**. | |
ADDITIONAL REQUIREMENT: | |
- **First, detect the language of the original text.** | |
- **Generate ALL output (topic names, key concepts, summaries, and quizzes) in the same language as the original text.** | |
- If the text is in Russian, generate all responses in Russian. | |
- If the text is in another language, generate responses in that original language. | |
Text: | |
{segment_text} | |
Format your response as JSON with the following structure: | |
{{ | |
"segments": [ | |
{{ | |
"topic_name": "Unique and Specific Topic Name", | |
"key_concepts": ["concept1", "concept2", "concept3"], | |
"summary": "Brief summary of this segment.", | |
"quiz_questions": [ | |
{{ | |
"question": "Question text?", | |
"options": [ | |
{{ | |
"text": "Option A", | |
"correct": false | |
}}, | |
{{ | |
"text": "Option B", | |
"correct": true | |
}}, | |
{{ | |
"text": "Option C", | |
"correct": false | |
}} | |
] | |
}} | |
] | |
}} | |
] | |
}} | |
IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others. | |
- **Do NOT repeat** key concepts across multiple segments unless absolutely necessary. | |
- **Ensure the quiz questions challenge the reader** and **are not easily guessable**. | |
""" | |
    response = llm.invoke(prompt)
    response_text = response.content

    try:
        # Extract the outermost {...} block in case the model wraps the JSON
        # in extra prose or a markdown code fence.
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            return json.loads(json_match.group(0))
        else:
            return json.loads(response_text)
    except json.JSONDecodeError:
        return {
            "segments": [
                {
                    "topic_name": "JSON Parsing Error",
                    "key_concepts": ["Error in response format"],
                    "summary": "Could not parse the API response.",
                    "quiz_questions": []
                }
            ]
        }
def process_document_with_quiz(text):
    start_time = time.time()

    token_count = len(tokenizer.encode(text))
    print(f"[LOG] Total document tokens: {token_count}")

    if token_count > 8000:
        print("[LOG] Document exceeds 8000 tokens. Splitting into parts.")
        parts = split_text_by_tokens(text)
        print(f"[LOG] Document split into {len(parts)} parts")
        for i, part in enumerate(parts):
            part_tokens = len(tokenizer.encode(part))
            print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
    else:
        print("[LOG] Document under 8000 tokens. Processing as a single part.")
        parts = [text]

    all_segments = []
    segment_counter = 1

    for i, part in enumerate(parts):
        part_start_time = time.time()
        print(f"[LOG] Processing part {i+1}...")

        analysis = analyze_segment_with_gemini(part)

        if "segments" in analysis:
            print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
            for segment in analysis["segments"]:
                segment["segment_number"] = segment_counter
                all_segments.append(segment)
                print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
                segment_counter += 1
        else:
            # Fallback if the response format is unexpected
            print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
            fallback_segment = {
                "topic_name": f"Segment {segment_counter} Analysis",
                "key_concepts": ["Format error in analysis"],
                "summary": "Could not properly segment this part of the text.",
                "quiz_questions": [],
                "segment_number": segment_counter
            }
            all_segments.append(fallback_segment)
            print(f"[LOG] Added fallback segment {segment_counter}")
            segment_counter += 1

        part_time = time.time() - part_start_time
        print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")

    total_time = time.time() - start_time
    print(f"[LOG] Total processing time: {total_time:.2f} seconds")
    print(f"[LOG] Generated {len(all_segments)} segments total")
    return all_segments
def format_quiz_for_display(results):
    output = []
    for segment in results:
        topic = segment["topic_name"]
        segment_num = segment["segment_number"]
        output.append(f"\n\n{'='*40}")
        output.append(f"SEGMENT {segment_num}: {topic}")
        output.append(f"{'='*40}\n")

        output.append("KEY CONCEPTS:")
        for concept in segment["key_concepts"]:
            output.append(f"• {concept}")

        output.append("\nSUMMARY:")
        output.append(segment["summary"])

        output.append("\nQUIZ QUESTIONS:")
        for i, q in enumerate(segment["quiz_questions"]):
            output.append(f"\n{i+1}. {q['question']}")
            for j, option in enumerate(q['options']):
                letter = chr(97 + j).upper()
                correct_marker = " ✓" if option["correct"] else ""
                output.append(f"   {letter}. {option['text']}{correct_marker}")
    return "\n".join(output)
def save_results_as_json(results, filename="analysis_results.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    return filename

def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(formatted_text)
    return filename
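# Note: analyze_document below writes its JSON/TXT output inline rather than
# calling these helpers; they appear to be kept for standalone use.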
def analyze_document(document_text, api_key):
    print("[LOG] Starting document analysis...")
    overall_start_time = time.time()
    os.environ["GOOGLE_API_KEY"] = api_key
    try:
        results = process_document_with_quiz(document_text)
        formatted_output = format_quiz_for_display(results)

        json_path = "analysis_results.json"
        txt_path = "analysis_results.txt"
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(formatted_output)

        overall_time = time.time() - overall_start_time
        print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")

        topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
        topics_summary += f"Total segments: {len(results)}\n"
        topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
        topics_summary += "SEGMENTS:\n"
        for segment in results:
            topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"

        formatted_output = topics_summary + "\n" + formatted_output
        return formatted_output, json_path, txt_path
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        print(f"[LOG] ERROR: {error_msg}")
        return error_msg, None, None
with gr.Blocks(title="Quiz Generator") as app:
    gr.Markdown("# Quiz Generator")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Document Text",
                placeholder="Paste your document text here...",
                lines=10
            )
            api_key = gr.Textbox(
                label="Gemini API Key",
                placeholder="Enter your Gemini API key",
                type="password"
            )
            analyze_btn = gr.Button("Analyze Document")
        with gr.Column():
            output_results = gr.Textbox(
                label="Analysis Results",
                lines=20
            )
            json_file_output = gr.File(label="Download JSON")
            txt_file_output = gr.File(label="Download TXT")

    analyze_btn.click(
        fn=analyze_document,
        inputs=[input_text, api_key],
        outputs=[output_results, json_file_output, txt_file_output]
    )
if __name__ == "__main__":
    app.launch()
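# The default launch() is sufficient on a hosted Space; when running locally,
# passing share=True to app.launch() would expose a temporary public URL.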