MrSimple01 committed on
Commit f3c5ac6 · verified · 1 Parent(s): 2876d5f

Update app.py

Files changed (1): app.py (+233 -201)
app.py CHANGED
@@ -1,16 +1,56 @@
 import re
-import numpy as np
 import json
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer
-from langchain_google_genai import ChatGoogleGenerativeAI
-import os
-import gradio as gr
 import time
 
-tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
-sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 def clean_text(text):
     text = re.sub(r'\[speaker_\d+\]', '', text)
@@ -43,243 +83,235 @@ def split_text_by_tokens(text, max_tokens=8000):
 
     return [" ".join(first_half), " ".join(second_half)]
 
-def analyze_segment_with_gemini(segment_text):
-    llm = ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        temperature=0.7,
-        max_tokens=None,
-        timeout=None,
-        max_retries=3
-    )
-
-    prompt = f"""
-    Analyze the following text, identify distinct segments within it, and perform text segmentation:
-    1. Segments should be STRICTLY max=15
-    2. For each segment/topic you identify:
-       - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
-       - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
-       - Write a brief summary of that segment (3-5 sentences)
-       - Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content of that segment only
-       - Questions and answers must come only from the content of the segment
-
-    For each quiz question:
-       - Create one correct answer that comes DIRECTLY from the text
-       - Create two plausible but incorrect answers
-       - IMPORTANT: Ensure all answer options have similar length (± 3 words)
-       - Ensure the correct answer is clearly indicated with a ✓ symbol
-       - Questions should **require actual understanding**, NOT just basic fact recall
-       - Questions are **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**
-       - Questions are **directly based on the segment's content** (not inferred from the summary)
-       - Do **not include questions about document structure** (e.g., title, number of paragraphs)
-       - Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?")
-       - Focus on **core ideas, logical reasoning, and conceptual understanding**
-
-    ADDITIONAL REQUIREMENT:
-    - **First, detect the language of the original text.**
-    - **Generate ALL output (topic names, key concepts, summaries, and quizzes) in the same language as the original text.**
-    - If the text is in Russian, generate all responses in Russian.
-    - If the text is in another language, generate responses in that original language.
-
-    Text:
-    {segment_text}
 
-    Format your response as JSON with the following structure:
-    {{
-        "segments": [
-            {{
-                "topic_name": "Unique and Specific Topic Name",
-                "key_concepts": ["concept1", "concept2", "concept3"],
-                "summary": "Brief summary of this segment.",
-                "quiz_questions": [
-                    {{
-                        "question": "Question text?",
-                        "options": [
-                            {{
-                                "text": "Option A",
-                                "correct": false
-                            }},
-                            {{
-                                "text": "Option B",
-                                "correct": true
-                            }},
-                            {{
-                                "text": "Option C",
-                                "correct": false
-                            }}
-                        ]
-                    }}
-                ]
-            }}
-        ]
-    }}
 
-    IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
-    - **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
-    - **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
 
-    """
 
-    response = llm.invoke(prompt)
-    response_text = response.content
 
-    try:
-        json_match = re.search(r'\{[\s\S]*\}', response_text)
-        if json_match:
-            return json.loads(json_match.group(0))
-        else:
-            return json.loads(response_text)
-    except json.JSONDecodeError:
-        return {
-            "segments": [
-                {
-                    "topic_name": "JSON Parsing Error",
-                    "key_concepts": ["Error in response format"],
-                    "summary": "Could not parse the API response.",
-                    "quiz_questions": []
-                }
-            ]
         }
-
-def process_document_with_quiz(text):
-    start_time = time.time()
-
-    token_count = len(tokenizer.encode(text))
-    print(f"[LOG] Total document tokens: {token_count}")
-
-    if token_count > 8000:
-        print(f"[LOG] Document exceeds 8000 tokens. Splitting into parts.")
-        parts = split_text_by_tokens(text)
-        print(f"[LOG] Document split into {len(parts)} parts")
-
-        for i, part in enumerate(parts):
-            part_tokens = len(tokenizer.encode(part))
-            print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
-    else:
-        print(f"[LOG] Document under 8000 tokens. Processing as a single part.")
-        parts = [text]
-
-    all_segments = []
-    segment_counter = 1
-
-    for i, part in enumerate(parts):
-        part_start_time = time.time()
-        print(f"[LOG] Processing part {i+1}...")
-
-        analysis = analyze_segment_with_gemini(part)
-
-        if "segments" in analysis:
-            print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
-
-            for segment in analysis["segments"]:
-                segment["segment_number"] = segment_counter
-                all_segments.append(segment)
-                print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
-                segment_counter += 1
-        else:
-            # Fallback if the response format is unexpected
-            print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
-            fallback_segment = {
-                "topic_name": f"Segment {segment_counter} Analysis",
-                "key_concepts": ["Format error in analysis"],
-                "summary": "Could not properly segment this part of the text.",
-                "quiz_questions": [],
-                "segment_number": segment_counter
-            }
-            all_segments.append(fallback_segment)
-            print(f"[LOG] Added fallback segment {segment_counter}")
-            segment_counter += 1
-
-        part_time = time.time() - part_start_time
-        print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")
-
-    total_time = time.time() - start_time
-    print(f"[LOG] Total processing time: {total_time:.2f} seconds")
-    print(f"[LOG] Generated {len(all_segments)} segments total")
-
-    return all_segments
-
 
 def format_quiz_for_display(results):
     output = []
 
-    for segment in results:
         topic = segment["topic_name"]
-        segment_num = segment["segment_number"]
-
         output.append(f"\n\n{'='*40}")
         output.append(f"SEGMENT {segment_num}: {topic}")
         output.append(f"{'='*40}\n")
-
         output.append("KEY CONCEPTS:")
         for concept in segment["key_concepts"]:
             output.append(f"• {concept}")
-
         output.append("\nSUMMARY:")
         output.append(segment["summary"])
-
         output.append("\nQUIZ QUESTIONS:")
         for i, q in enumerate(segment["quiz_questions"]):
             output.append(f"\n{i+1}. {q['question']}")
-
             for j, option in enumerate(q['options']):
-                letter = chr(97 + j).upper()
                 correct_marker = " ✓" if option["correct"] else ""
                 output.append(f"   {letter}. {option['text']}{correct_marker}")
-
     return "\n".join(output)
 
-def save_results_as_json(results, filename="analysis_results.json"):
-    with open(filename, "w", encoding="utf-8") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
-    return filename
-
-def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
-    with open(filename, "w", encoding="utf-8") as f:
-        f.write(formatted_text)
-    return filename
-
-
-def analyze_document(document_text, api_key):
-    print(f"[LOG] Starting document analysis...")
-    overall_start_time = time.time()
-
-    os.environ["GOOGLE_API_KEY"] = api_key
     try:
-        results = process_document_with_quiz(document_text)
-        formatted_output = format_quiz_for_display(results)
-
-        json_path = "analysis_results.json"
-        txt_path = "analysis_results.txt"
 
-        with open(json_path, "w", encoding="utf-8") as f:
-            json.dump(results, f, indent=2, ensure_ascii=False)
 
-        with open(txt_path, "w", encoding="utf-8") as f:
-            f.write(formatted_output)
 
-        overall_time = time.time() - overall_start_time
-        print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")
 
-        topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
-        topics_summary += f"Total segments: {len(results)}\n"
-        topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
-        topics_summary += "SEGMENTS:\n"
 
-        for segment in results:
-            topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"
 
-        formatted_output = topics_summary + "\n" + formatted_output
 
-        return formatted_output, json_path, txt_path
     except Exception as e:
-        error_msg = f"Error processing document: {str(e)}"
-        print(f"[LOG] ERROR: {error_msg}")
-        return error_msg, None, None
 
 with gr.Blocks(title="Quiz Generator") as app:
     gr.Markdown("# Quiz Generator")
-
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
@@ -289,8 +321,8 @@ with gr.Blocks(title="Quiz Generator") as app:
             )
 
             api_key = gr.Textbox(
-                label="Gemini API Key",
-                placeholder="Enter your Gemini API key",
                 type="password"
             )
 
@@ -306,9 +338,9 @@ with gr.Blocks(title="Quiz Generator") as app:
 
     analyze_btn.click(
         fn=analyze_document,
-        inputs=[input_text, api_key],
         outputs=[output_results, json_file_output, txt_file_output]
     )
-
 if __name__ == "__main__":
     app.launch()
 
+import os
 import re
 import json
 import time
+import gradio as gr
+import tempfile
+from typing import Dict, Any, List, Optional
+from transformers import AutoTokenizer
+from sentence_transformers import SentenceTransformer
+from pydantic import BaseModel, Field
+from anthropic import Anthropic
+
+
+CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
+OPENAI_MODEL = "gpt-4o"
+GEMINI_MODEL = "gemini-2.0-flash"
 
+DEFAULT_TEMPERATURE = 0.7
 
+TOKENIZER_MODEL = "answerdotai/ModernBERT-base"
+SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"
+
+class CourseInfo(BaseModel):
+    course_name: str = Field(description="Name of the course")
+    section_name: str = Field(description="Name of the course section")
+    lesson_name: str = Field(description="Name of the lesson")
+
+class QuizOption(BaseModel):
+    text: str = Field(description="The text of the answer option")
+    correct: bool = Field(description="Whether this option is correct")
+
+class QuizQuestion(BaseModel):
+    question: str = Field(description="The text of the quiz question")
+    options: List[QuizOption] = Field(description="List of answer options")
+
+class Segment(BaseModel):
+    segment_number: int = Field(description="The segment number")
+    topic_name: str = Field(description="Unique and specific topic name that clearly differentiates it from other segments")
+    key_concepts: List[str] = Field(description="3-5 key concepts discussed in the segment")
+    summary: str = Field(description="Brief summary of the segment (3-5 sentences)")
+    quiz_questions: List[QuizQuestion] = Field(description="5 quiz questions based on the segment content")
+
+class TextSegmentAnalysis(BaseModel):
+    course_info: CourseInfo = Field(description="Information about the course")
+    segments: List[Segment] = Field(description="List of text segments with analysis")
+
+
48
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)
49
+ sentence_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
50
+
51
+ # System prompt
52
+ system_prompt = """You are an expert educational content analyzer. Your task is to analyze text content,
53
+ identify distinct segments, and create high-quality educational quiz questions for each segment."""
54
 
55
  def clean_text(text):
56
  text = re.sub(r'\[speaker_\d+\]', '', text)
 
 
     return [" ".join(first_half), " ".join(second_half)]
 
+def generate_with_claude(text, api_key, course_name="", section_name="", lesson_name=""):
+    from prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_CLAUDE
+
+    client = Anthropic(api_key=api_key)
+
+    segment_analysis_schema = TextSegmentAnalysis.model_json_schema()
+
+    tools = [
+        {
+            "name": "build_segment_analysis",
+            "description": "Build the text segment analysis with quiz questions",
+            "input_schema": segment_analysis_schema
         }
+    ]
+
+    system_prompt = """You are a helpful assistant specialized in text analysis and educational content creation.
+You analyze texts to identify distinct segments, create summaries, and generate quiz questions."""
+
+    prompt = ANALYSIS_PROMPT_TEMPLATE_CLAUDE.format(
+        course_name=course_name,
+        section_name=section_name,
+        lesson_name=lesson_name,
+        text=text
+    )
+
+    try:
+        response = client.messages.create(
+            model=CLAUDE_MODEL,
+            max_tokens=8192,
+            temperature=DEFAULT_TEMPERATURE,
+            system=system_prompt,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ],
+            tools=tools,
+            tool_choice={"type": "tool", "name": "build_segment_analysis"}
+        )
+
+        # Extract the tool call content
+        if response.content and len(response.content) > 0 and hasattr(response.content[0], 'input'):
+            function_call = response.content[0].input
+            return function_call
+        else:
+            raise Exception("No valid tool call found in the response")
+    except Exception as e:
+        raise Exception(f"Error calling Anthropic API: {str(e)}")
+
+
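Note: tool_choice={"type": "tool", ...} forces Claude to answer through the tool, so response.content[0].input is already a dict shaped by the schema. A possible follow-up check (a sketch, not part of this commit) would be to validate that dict against the same Pydantic model it was generated from:

    from pydantic import ValidationError

    # function_call is the plain dict returned by the tool call above.
    try:
        analysis = TextSegmentAnalysis.model_validate(function_call)
    except ValidationError as e:
        print(f"Tool output failed schema validation: {e}")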
+def get_llm_by_api_key(api_key):
+    if api_key.startswith("sk-ant-"):  # Claude API key format
+        from langchain_anthropic import ChatAnthropic
+        return ChatAnthropic(
+            anthropic_api_key=api_key,
+            model_name=CLAUDE_MODEL,
+            temperature=DEFAULT_TEMPERATURE,
+            max_retries=3
+        )
+    elif api_key.startswith("sk-"):  # OpenAI API key format
+        from langchain_openai import ChatOpenAI
+        return ChatOpenAI(
+            openai_api_key=api_key,
+            model_name=OPENAI_MODEL,
+            temperature=DEFAULT_TEMPERATURE,
+            max_retries=3
+        )
+    else:  # Default to Gemini
+        from langchain_google_genai import ChatGoogleGenerativeAI
+        os.environ["GOOGLE_API_KEY"] = api_key
+        return ChatGoogleGenerativeAI(
+            model=GEMINI_MODEL,
+            temperature=DEFAULT_TEMPERATURE,
+            max_retries=3
+        )
+
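Note: the branch order matters here, since every "sk-ant-" key also starts with "sk-". A small sketch of the same routing logic (detect_provider is a hypothetical helper, and the key strings are invented for illustration):

    def detect_provider(api_key: str) -> str:
        # "sk-ant-" must be checked before "sk-", or Claude keys
        # would be misrouted to the OpenAI branch.
        if api_key.startswith("sk-ant-"):
            return "claude"
        if api_key.startswith("sk-"):
            return "openai"
        return "gemini"

    assert detect_provider("sk-ant-xxxx") == "claude"
    assert detect_provider("sk-xxxx") == "openai"
    assert detect_provider("AIzaXXXX") == "gemini"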
+def segment_and_analyze_text(text: str, api_key: str, course_name="", section_name="", lesson_name="") -> Dict[str, Any]:
+    from prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_GEMINI
+    if api_key.startswith("sk-ant-"):
+        return generate_with_claude(text, api_key, course_name, section_name, lesson_name)
+
+    # For other models, use LangChain
+    llm = get_llm_by_api_key(api_key)
+
+    prompt = ANALYSIS_PROMPT_TEMPLATE_GEMINI.format(
+        course_name=course_name,
+        section_name=section_name,
+        lesson_name=lesson_name,
+        text=text
+    )
+
+    try:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": prompt}
+        ]
+
+        response = llm.invoke(messages)
+
+        try:
+            content = response.content
+            json_match = re.search(r'```json\s*([\s\S]*?)\s*```', content)
+
+            if json_match:
+                json_str = json_match.group(1)
+            else:
+                json_match = re.search(r'(\{[\s\S]*\})', content)
+                if json_match:
+                    json_str = json_match.group(1)
+                else:
+                    json_str = content
+
+            # Parse the JSON
+            function_call = json.loads(json_str)
+            return function_call
+        except json.JSONDecodeError:
+            raise Exception("Could not parse JSON from LLM response")
+    except Exception as e:
+        raise Exception(f"Error calling API: {str(e)}")
 
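Note: the extraction order above is deliberate: chat models frequently wrap JSON in a fenced markdown block, so the fenced pattern is tried first, then a bare {...} object, then the raw content. A quick illustration of the first pattern (sample string invented for the example):

    import re

    sample = 'Here you go:\n```json\n{"segments": []}\n```'
    m = re.search(r'```json\s*([\s\S]*?)\s*```', sample)
    print(m.group(1))  # {"segments": []}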
 
 def format_quiz_for_display(results):
     output = []
 
+    if "course_info" in results:
+        course_info = results["course_info"]
+        output.append(f"{'='*40}")
+        output.append(f"COURSE: {course_info.get('course_name', 'N/A')}")
+        output.append(f"SECTION: {course_info.get('section_name', 'N/A')}")
+        output.append(f"LESSON: {course_info.get('lesson_name', 'N/A')}")
+        output.append(f"{'='*40}\n")
+
+    segments = results.get("segments", [])
+    for i, segment in enumerate(segments):
         topic = segment["topic_name"]
+        segment_num = i + 1
 
         output.append(f"\n\n{'='*40}")
         output.append(f"SEGMENT {segment_num}: {topic}")
         output.append(f"{'='*40}\n")
 
         output.append("KEY CONCEPTS:")
         for concept in segment["key_concepts"]:
             output.append(f"• {concept}")
 
         output.append("\nSUMMARY:")
         output.append(segment["summary"])
 
         output.append("\nQUIZ QUESTIONS:")
         for i, q in enumerate(segment["quiz_questions"]):
             output.append(f"\n{i+1}. {q['question']}")
 
             for j, option in enumerate(q['options']):
+                letter = chr(97 + j).upper()
                 correct_marker = " ✓" if option["correct"] else ""
                 output.append(f"   {letter}. {option['text']}{correct_marker}")
 
     return "\n".join(output)
 
+def analyze_document(text, api_key, course_name, section_name, lesson_name):
     try:
+        start_time = time.time()
+
+        # Split text if it's too long
+        text_parts = split_text_by_tokens(text)
+
+        all_results = {
+            "course_info": {
+                "course_name": course_name,
+                "section_name": section_name,
+                "lesson_name": lesson_name
+            },
+            "segments": []
+        }
+        segment_counter = 1
+
+        # Process each part of the text
+        for part in text_parts:
+            analysis = segment_and_analyze_text(
+                part,
+                api_key,
+                course_name=course_name,
+                section_name=section_name,
+                lesson_name=lesson_name
+            )
+
+            if "segments" in analysis:
+                for segment in analysis["segments"]:
+                    segment["segment_number"] = segment_counter
+                    all_results["segments"].append(segment)
+                    segment_counter += 1
+
+        end_time = time.time()
+        total_time = end_time - start_time
+
+        # Format the results for display
+        formatted_text = format_quiz_for_display(all_results)
+        formatted_text = f"Total processing time: {total_time:.2f} seconds\n\n" + formatted_text
+
+        # Create temporary files for JSON and text output
+        json_path = tempfile.mktemp(suffix='.json')
+        with open(json_path, 'w', encoding='utf-8') as json_file:
+            json.dump(all_results, json_file, indent=2)
+
+        txt_path = tempfile.mktemp(suffix='.txt')
+        with open(txt_path, 'w', encoding='utf-8') as txt_file:
+            txt_file.write(formatted_text)
+
+        return formatted_text, json_path, txt_path
     except Exception as e:
+        error_message = f"Error processing document: {str(e)}"
+        return error_message, None, None
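Note: tempfile.mktemp is deprecated because it only returns a name, leaving a window in which another process can create the file first. A safer variant of the same Gradio file-output pattern (a sketch, not part of this commit) would be:

    import json
    import tempfile

    # NamedTemporaryFile creates the file atomically; delete=False keeps
    # it on disk so Gradio's File components can serve it afterwards.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False,
                                     encoding='utf-8') as json_file:
        json.dump({"segments": []}, json_file, indent=2)
        json_path = json_file.name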
 
 with gr.Blocks(title="Quiz Generator") as app:
     gr.Markdown("# Quiz Generator")
+
+    with gr.Row():
+        with gr.Column():
+            course_name = gr.Textbox(
+                placeholder="Enter the course name",
+                label="Course Name"
+            )
+            section_name = gr.Textbox(
+                placeholder="Enter the section name",
+                label="Section Name"
+            )
+            lesson_name = gr.Textbox(
+                placeholder="Enter the lesson name",
+                label="Lesson Name"
+            )
+
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
             )
 
             api_key = gr.Textbox(
+                label="API Key",
+                placeholder="Enter your OpenAI, Claude, or Gemini API key",
                 type="password"
             )
 
     analyze_btn.click(
         fn=analyze_document,
+        inputs=[input_text, api_key, course_name, section_name, lesson_name],
         outputs=[output_results, json_file_output, txt_file_output]
     )
+
 if __name__ == "__main__":
     app.launch()