I can see chapter titles in the e-pub but I get error on the commentary
Browse files- README.md +8 -0
- apps/reader.py +364 -0
- prompts/card_generation.txt +38 -0
- prompts/commentary.txt +25 -0
- requirements.txt +5 -1
- utils/document_parser.py +191 -0
README.md
CHANGED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Run the reader app
|
2 |
+
python apps/reader.py
|
3 |
+
|
4 |
+
# Run the producer app
|
5 |
+
python apps/producer.py
|
6 |
+
|
7 |
+
# Run a script
|
8 |
+
python scripts/transcript.py audio_file.mp3
|
apps/reader.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
from pathlib import Path

# Add the project root to the Python path so `utils.*` and the `prompts/`
# directory resolve when the app is launched as `python apps/reader.py`.
project_root = str(Path(__file__).parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

import gradio as gr
import asyncio
import os
import json
import requests
from anthropic import Anthropic
from utils.document_parser import DocumentParser
from dotenv import load_dotenv

# Load environment variables (ANTHROPIC_API_KEY, MOCHI_API_KEY) from the
# project-root .env file.
env_path = Path(project_root) / ".env"
load_dotenv(env_path)

# Mochi deck IDs keyed by the human-readable category names that the card
# generation prompt uses; `upload_to_mochi` routes each approved card to the
# matching deck.  NOTE(review): these IDs are account-specific — confirm they
# match the target Mochi account.
DECK_CATEGORIES = {
    "CS/Hardware": "rhGqR9SK",
    "Math/Physics": "Dm5vczZg",
    "AI": "SS9QEfiy",
    "History/Military": "3nJYp7Zh",
    "Quotes/Random": "rWUzSu8t",
    "Bio": "BspzxaUJ",
    "Econ/Finance": "mvvJ27Q1"
}
|
32 |
+
|
33 |
+
class CardGenerator:
    """Handles card generation and Mochi integration.

    Holds the review-session state (pending cards, cursor, approved cards)
    and wraps the Claude and Mochi API calls.
    """

    def __init__(self):
        self.parser = DocumentParser()
        # BUG FIX: the generation methods `await` this client, so it must be
        # the async variant.  Awaiting the synchronous `Anthropic` client's
        # `messages.create(...)` raises a TypeError, which surfaced in the UI
        # as "Error: ..." in the commentary box.
        from anthropic import AsyncAnthropic
        self.claude = AsyncAnthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        self.mochi_key = os.getenv("MOCHI_API_KEY")

        # Load system prompts relative to the project root so the app works
        # regardless of the current working directory (the original used a
        # CWD-relative "prompts/..." path).
        prompt_dir = Path(__file__).parent.parent / "prompts"
        self.prompts = {
            key: (prompt_dir / f"{key}.txt").read_text()
            for key in ["card_generation", "commentary"]
        }

        # Review-session state.
        self.current_cards = []   # cards awaiting review (list of dicts)
        self.current_index = 0    # index of the card currently on screen
        self.approved_cards = []  # cards accepted for upload to Mochi

    def get_chapter_list(self, file_data) -> list[str]:
        """Return the chapter titles of an uploaded document.

        Args:
            file_data: Raw bytes (or file-like object) from Gradio.

        Returns:
            Chapter titles, or a one-element ``["Error: ..."]`` list on failure.
        """
        try:
            if not file_data:
                return []

            # Attempt to extract a filename from the upload.
            filename = getattr(file_data, 'name', None)
            if not filename:
                filename = "uploaded_file"
                print("DEBUG: No filename attribute found, using default.")
            else:
                # BUG FIX: the original f-string printed the literal text
                # "(unknown)" instead of interpolating the filename.
                print(f"DEBUG: Filename extracted: {filename}")

            # Determine the file type; sniff magic bytes when the name has
            # no extension (EPUBs are ZIP archives, hence the b'PK' header).
            file_ext = Path(filename).suffix.lower()
            if not file_ext:
                print("DEBUG: No file extension found, checking content type.")
                if file_data.startswith(b'%PDF-'):
                    file_ext = '.pdf'
                elif file_data.startswith(b'PK'):
                    file_ext = '.epub'
                else:
                    raise ValueError("Unsupported file type")
            print(f"DEBUG: File extension: {file_ext}")

            return self.parser.load_document(file_data, filename)
        except Exception as e:
            return [f"Error: {str(e)}"]

    async def process_chapter(self, file_data, chapter_idx) -> tuple:
        """Process a chapter and generate cards + commentary.

        Args:
            file_data: Raw bytes from Gradio.
            chapter_idx: Chapter index (int), or the chapter *title* string —
                Gradio dropdowns deliver the selected value by default.

        Returns:
            ``(card_dict, commentary)`` on success,
            ``(None, error_message)`` on failure.
        """
        try:
            if not file_data:
                return None, "No file provided"

            # BUG FIX: reload the document on every call.  The `finally:
            # cleanup()` below clears parser state, so a second click on
            # "Process Chapter" previously failed with "No document loaded".
            titles = self.parser.load_document(file_data)

            # BUG FIX: translate a dropdown title string into an index; the
            # parser API takes an integer.
            if isinstance(chapter_idx, str):
                chapter_idx = titles.index(chapter_idx)

            content = self.parser.get_chapter_content(chapter_idx)

            # Run both Claude calls concurrently.
            cards, commentary = await asyncio.gather(
                self._generate_cards(content),
                self._generate_commentary(content)
            )

            # Claude sometimes wraps the JSON in prose or code fences;
            # extract the outermost array before parsing.
            start, end = cards.find('['), cards.rfind(']')
            if start != -1 and end > start:
                cards = cards[start:end + 1]

            self.current_cards = json.loads(cards)
            self.current_index = 0
            self.approved_cards = []

            # Return the first card and the commentary text.
            return self._get_current_card(), commentary

        except Exception as e:
            return None, f"Error: {str(e)}"
        finally:
            self.parser.cleanup()

    async def _ask_claude(self, prompt_key: str, content: str) -> str:
        """Send `content` to Claude under the named system prompt; return the reply text."""
        response = await self.claude.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=4000,
            system=self.prompts[prompt_key],
            messages=[{"role": "user", "content": content}]
        )
        return response.content[0].text

    async def _generate_cards(self, content: str) -> str:
        """Generate flashcards (JSON array text) using Claude."""
        return await self._ask_claude("card_generation", content)

    async def _generate_commentary(self, content: str) -> str:
        """Generate chapter commentary using Claude."""
        return await self._ask_claude("commentary", content)

    def _get_current_card(self) -> dict:
        """Return the current card plus the UI state flags the handlers need."""
        if not self.current_cards or self.current_index >= len(self.current_cards):
            # Review finished (or nothing parsed): hide the accept/reject
            # buttons and expose the upload button instead.
            return {
                'front': "",
                'back': "",
                'category': "",
                'status': "No more cards to review",
                'show_buttons': False,
                'show_upload': True
            }

        card = self.current_cards[self.current_index]
        return {
            'front': card['front'],
            'back': card['back'],
            'category': card['category'],
            'status': f"Card {self.current_index + 1} of {len(self.current_cards)}",
            'show_buttons': True,
            'show_upload': False
        }

    def accept_card(self, front: str, back: str, category: str) -> dict:
        """Accept the current card (with any user edits) and advance.

        The values come from the UI textboxes so manual edits are kept.
        """
        if self.current_index < len(self.current_cards):
            self.approved_cards.append({
                'front': front,
                'back': back,
                'category': category
            })

        self.current_index += 1
        return self._get_current_card()

    def reject_card(self) -> dict:
        """Discard the current card and show the next one.

        Removing the card in place means `current_index` already points at
        the next card, so the index is deliberately not incremented.
        """
        if self.current_index < len(self.current_cards):
            self.current_cards.pop(self.current_index)
        return self._get_current_card()

    def upload_to_mochi(self) -> str:
        """Upload all approved cards to Mochi and return a status summary."""
        if not self.approved_cards:
            return "No cards to upload!"

        errors = []
        for card in self.approved_cards:
            try:
                # Format the card for the Mochi API.
                mochi_card = {
                    "deck-id": DECK_CATEGORIES[card["category"]],
                    "fields": {
                        "name": {"id": "name", "value": card["front"]},
                        "back": {"id": "back", "value": card["back"]}
                    }
                }

                # Mochi uses HTTP basic auth with the API key as username.
                response = requests.post(
                    "https://app.mochi.cards/api/cards",
                    json=mochi_card,
                    auth=(self.mochi_key, ""),
                    timeout=30  # don't hang the UI on a dead connection
                )

                if response.status_code != 200:
                    errors.append(f"Error: {response.text}")

            except Exception as e:
                errors.append(f"Error: {str(e)}")

        # Clear approved cards regardless of outcome.
        success_count = len(self.approved_cards) - len(errors)
        self.approved_cards = []

        if errors:
            return f"Uploaded {success_count} cards with {len(errors)} errors:\n" + "\n".join(errors)
        return f"Successfully uploaded {success_count} cards to Mochi!"
|
217 |
+
|
218 |
+
def create_interface():
    """Create the Gradio interface.

    Wires an upload box, a chapter dropdown, a commentary pane, and a
    card-review loop (accept / reject / upload-to-Mochi) to one shared
    `CardGenerator` instance.
    """
    generator = CardGenerator()

    with gr.Blocks(title="Document Reader & Card Generator") as app:
        # Document upload and chapter selection.
        with gr.Row():
            file_input = gr.File(
                label="Upload EPUB Document",
                type="binary",
                file_types=[".epub"]
            )

            # BUG FIX: `type="index"` makes the dropdown deliver the selected
            # chapter's integer index, which is what the parser's
            # `get_chapter_content(chapter_idx)` expects.  With the default
            # `type="value"` the handler received the title string and every
            # "Process Chapter" click errored out.
            chapter_select = gr.Dropdown(
                label="Select Chapter",
                choices=[],
                interactive=True,
                visible=False,
                type="index"
            )

        def update_chapters(file):
            """Populate the chapter dropdown when a file is uploaded."""
            if not file:
                return gr.update(choices=[], visible=False)
            chapters = generator.get_chapter_list(file)
            # NOTE(review): on a parse failure `chapters` is a one-element
            # ["Error: ..."] list and is shown as the only choice.
            return gr.update(choices=chapters, visible=True, value=chapters[0] if chapters else None)

        file_input.change(
            fn=update_chapters,
            inputs=[file_input],
            outputs=[chapter_select]
        )

        process_btn = gr.Button("Process Chapter")

        # Commentary section.
        commentary = gr.Textbox(
            label="Commentary",
            lines=10,
            interactive=False
        )

        # Card review section.
        gr.Markdown("## Review Cards")

        with gr.Row():
            card_front = gr.Textbox(
                label="Front",
                lines=3,
                interactive=True
            )
            card_back = gr.Textbox(
                label="Back",
                lines=3,
                interactive=True
            )

        with gr.Row():
            deck_category = gr.Dropdown(
                choices=list(DECK_CATEGORIES.keys()),
                label="Deck Category",
                value="AI"
            )
            card_status = gr.Textbox(
                label="Status",
                interactive=False
            )

        with gr.Row():
            accept_btn = gr.Button("Accept & Next", visible=False)
            reject_btn = gr.Button("Reject & Next", visible=False)
            upload_btn = gr.Button("Upload to Mochi", visible=False)

        upload_status = gr.Textbox(
            label="Upload Status",
            interactive=False
        )

        # Event handlers.
        async def process_chapter(file, chapter_idx):
            """Run generation and fan the result out to the UI components."""
            card, comment = await generator.process_chapter(file, chapter_idx)
            if not card:  # Error occurred; `comment` carries the message.
                return [
                    "", "", comment, gr.update(visible=False),
                    gr.update(visible=False), "", gr.update(visible=False)
                ]

            return [
                card['front'],
                card['back'],
                comment,
                gr.update(visible=card['show_buttons']),
                gr.update(visible=card['show_buttons']),
                card['status'],
                gr.update(visible=card['show_upload'])
            ]

        def handle_card_action(action, front, back, category):
            """Accept or reject the current card and refresh the review UI."""
            card = (generator.accept_card(front, back, category)
                    if action == 'accept' else
                    generator.reject_card())

            return [
                card['front'],
                card['back'],
                card['status'],
                gr.update(visible=card['show_buttons']),
                gr.update(visible=card['show_buttons']),
                card['category'],
                gr.update(visible=card['show_upload'])
            ]

        # Connect events.
        process_btn.click(
            fn=process_chapter,
            inputs=[file_input, chapter_select],
            outputs=[
                card_front, card_back, commentary,
                accept_btn, reject_btn, card_status, upload_btn
            ]
        )

        accept_btn.click(
            fn=lambda f, b, c: handle_card_action('accept', f, b, c),
            inputs=[card_front, card_back, deck_category],
            outputs=[
                card_front, card_back, card_status,
                accept_btn, reject_btn, deck_category, upload_btn
            ]
        )

        reject_btn.click(
            fn=lambda: handle_card_action('reject', None, None, None),
            outputs=[
                card_front, card_back, card_status,
                accept_btn, reject_btn, deck_category, upload_btn
            ]
        )

        upload_btn.click(
            fn=generator.upload_to_mochi,
            outputs=[upload_status]
        )

    return app
|
362 |
+
|
363 |
+
# Entry point: build the Gradio UI and serve it on the default local port.
if __name__ == "__main__":
    create_interface().launch()
|
prompts/card_generation.txt
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You are an expert at creating high-quality spaced repetition flashcards that promote deep understanding and retention. Your task is to generate flashcards from the given text that are:
|
2 |
+
|
3 |
+
1. Clear and concise
2. Focused on one concept per card
3. Designed to test understanding rather than just recall
4. Free of overly complex or compound questions
5. Written in precise language
|
8 |
+
|
9 |
+
Each card must be assigned to one of these categories:
|
10 |
+
- CS/Hardware
|
11 |
+
- Math/Physics
|
12 |
+
- AI
|
13 |
+
- History/Military
|
14 |
+
- Quotes/Random
|
15 |
+
- Bio
|
16 |
+
- Econ/Finance
|
17 |
+
|
18 |
+
Format each card as a JSON object:
|
19 |
+
{
|
20 |
+
"category": "Category name from the list above",
|
21 |
+
"front": "Question or prompt",
|
22 |
+
"back": "Answer or explanation"
|
23 |
+
}
|
24 |
+
|
25 |
+
Example cards:
|
26 |
+
{
|
27 |
+
"category": "Bio",
|
28 |
+
"front": "What is the key difference between procedural and declarative memory?",
|
29 |
+
"back": "Procedural memory is for skills and procedures (how to ride a bike), while declarative memory is for facts and events (what you had for breakfast)."
|
30 |
+
}
|
31 |
+
|
32 |
+
{
|
33 |
+
"category": "Bio",
|
34 |
+
"front": "What role does the hippocampus play in memory formation?",
|
35 |
+
"back": "The hippocampus is crucial for converting short-term memories into long-term memories through a process called consolidation. It acts as a temporary storage and processing center before memories are distributed to other parts of the cortex."
|
36 |
+
}
|
37 |
+
|
38 |
+
Please generate 5-10 high-quality flashcards from the provided text. Focus on the most important concepts, insights, and relationships. Output ONLY a JSON array containing the card objects — no introductory text, no code fences, and no commentary outside the JSON.
|
prompts/commentary.txt
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You are an expert researcher and critical thinker. Your task is to analyze the provided text and generate insightful commentary that:
|
2 |
+
|
3 |
+
1. Identifies the key arguments, insights, and novel ideas
|
4 |
+
2. Highlights connections to other important concepts or fields
|
5 |
+
3. Points out particularly interesting or counterintuitive points
|
6 |
+
4. Suggests areas that merit further exploration
|
7 |
+
5. Notes any potential weaknesses or areas of uncertainty in the arguments
|
8 |
+
|
9 |
+
Your commentary should be scholarly but engaging, helping the reader develop a deeper understanding of the material. Focus on substance over style, and be specific rather than general.
|
10 |
+
|
11 |
+
Structure your response as follows:
|
12 |
+
|
13 |
+
Key Insights:
|
14 |
+
- [2-3 bullet points highlighting the most important takeaways]
|
15 |
+
|
16 |
+
Interesting Connections:
|
17 |
+
- [2-3 bullet points noting connections to other fields/concepts]
|
18 |
+
|
19 |
+
Worth Exploring Further:
|
20 |
+
- [1-2 bullet points suggesting related areas for deeper investigation]
|
21 |
+
|
22 |
+
Critical Notes:
|
23 |
+
- [1-2 bullet points on potential weaknesses or areas needing clarification]
|
24 |
+
|
25 |
+
Then provide 2-3 paragraphs of integrated analysis that weaves these points together into a coherent commentary.
|
requirements.txt
CHANGED
@@ -6,4 +6,8 @@ pandas
|
|
6 |
youtube-transcript-api
|
7 |
pydub
|
8 |
assemblyai
|
9 |
-
pytube
|
|
|
|
|
|
|
|
|
|
6 |
youtube-transcript-api
|
7 |
pydub
|
8 |
assemblyai
|
9 |
+
pytube
|
10 |
+
PyPDF2
|
11 |
+
EbookLib
|
12 |
+
beautifulsoup4
|
13 |
+
python-dotenv
|
utils/document_parser.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import tempfile
from pathlib import Path

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
7 |
+
class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content."""

    def __init__(self):
        self._temp_file = None             # path of the staged .epub copy on disk
        self._book = None                  # parsed epub.EpubBook, set by load_document
        self._chapters = []                # [{'title': str, 'item': document item}, ...]
        self._current_chapter_title = None # set by get_chapter_content (was previously unset)

    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and return its chapter titles.

        Args:
            file_data: Raw bytes, or a file-like object with a ``read()`` method.
            filename: Optional original filename (unused; EPUB is assumed).

        Raises:
            ValueError: If the data cannot be parsed as an EPUB.
        """
        # Drop any previously loaded document and its temp file.
        self.cleanup()

        # Get the raw bytes from the Gradio file data.
        content = file_data.read() if hasattr(file_data, 'read') else file_data

        # ebooklib reads only from a filesystem path, so stage the bytes.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            temp.write(content)
            self._temp_file = temp.name

        try:
            self._book = epub.read_epub(self._temp_file)
            print("DEBUG: Successfully read EPUB file")
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {str(e)}")
            raise ValueError(f"Failed to read EPUB: {str(e)}") from e

        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        return [chapter['title'] for chapter in self._chapters]

    def get_chapter_content(self, chapter_idx: int) -> str:
        """Return the plain-text content of the chapter at `chapter_idx`.

        Raises:
            ValueError: If no document is loaded or the index is out of range.
        """
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")

        if not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        self._current_chapter_title = chapter['title'].strip()

        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")
        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")

        return content

    def _extract_chapters(self) -> list[dict]:
        """Build the chapter list, trying three strategies in order:
        table of contents, heading scan, then one chapter per document."""
        chapters = self._chapters_from_toc()

        if not chapters:
            print("DEBUG: No chapters in TOC, scanning documents...")
            chapters = self._chapters_from_headings()

        if not chapters:
            print("DEBUG: No chapters found, using documents as chapters")
            chapters = [
                {'title': f"Chapter {i + 1}", 'item': doc}
                for i, doc in enumerate(self._documents())
            ]

        return chapters

    def _documents(self) -> list:
        """All HTML document items in the book.

        BUG FIX: ``ITEM_DOCUMENT`` is defined at the `ebooklib` package root,
        not on the `epub` submodule; the original ``epub.ITEM_DOCUMENT``
        raised AttributeError whenever this fallback path ran.
        """
        return [item for item in self._book.get_items()
                if item.get_type() == ebooklib.ITEM_DOCUMENT]

    def _chapters_from_toc(self) -> list[dict]:
        """Chapters from the table of contents; nested sections are
        flattened, with sub-entries indented two spaces per level."""
        chapters = []

        def add(entry, level):
            # Accept any Link-like object carrying a title and an href.
            if hasattr(entry, 'title') and hasattr(entry, 'href'):
                doc = self._book.get_item_with_href(entry.href)
                if doc:
                    prefix = "  " * level if level > 0 else ""
                    chapters.append({'title': prefix + entry.title, 'item': doc})

        def walk(entries, level=0):
            for item in entries:
                if isinstance(item, tuple):
                    # (section, children) pairs from nested TOC entries.
                    add(item[0], level)
                    if len(item) > 1:
                        if isinstance(item[1], (list, tuple)):
                            walk(item[1], level + 1)
                        elif hasattr(item[1], 'title'):  # single sub-item
                            walk([item[1]], level + 1)
                else:
                    add(item, level)

        if hasattr(self._book, 'toc'):
            walk(self._book.toc)

        print(f"DEBUG: Found {len(chapters)} chapters in TOC")
        for ch in chapters:
            print(f"  - {ch['title']}")
        return chapters

    def _chapters_from_headings(self) -> list[dict]:
        """Chapters inferred from <h1>/<h2> tags and elements whose CSS class
        mentions 'chapter' or 'title'."""
        chapters = []
        docs = self._documents()
        print(f"DEBUG: Found {len(docs)} documents to scan")

        for doc in docs:
            soup = BeautifulSoup(doc.get_content(), 'html.parser')
            headings = (
                soup.find_all(['h1', 'h2']) +
                soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
            )
            for heading in headings:
                # Collapse internal whitespace in the heading text.
                title = ' '.join(heading.get_text().split())
                if title:
                    chapters.append({'title': title, 'item': doc})

        return chapters

    def _get_chapter_text(self, item) -> str:
        """Extract readable plain text from a chapter document."""
        # BUG FIX: bind `soup` before the try block — the original created it
        # inside the try, so the fallback's `soup.get_text(...)` raised
        # NameError whenever the extraction itself failed early.
        soup = BeautifulSoup(item.get_content(), 'html.parser')
        try:
            # Remove script and style elements.
            for element in soup(['script', 'style']):
                element.decompose()

            # Prefer the main content area when one exists.
            content_area = soup.find('body') or soup.find('main') or soup

            # Collect text blocks, skipping navigation/boilerplate elements.
            # (`string=True` is the non-deprecated spelling of `text=True`.)
            text_blocks = [
                element.strip()
                for element in content_area.find_all(string=True, recursive=True)
                if element.parent.name not in ['script', 'style', 'nav', 'header']
                and element.strip()
            ]
            return '\n\n'.join(text_blocks)

        except Exception as e:
            print(f"DEBUG: Error extracting text: {str(e)}")
            # Fallback to simple whole-document text extraction.
            return soup.get_text(separator='\n\n', strip=True)

    def cleanup(self):
        """Delete the temp file and reset all parser state."""
        if self._temp_file and os.path.exists(self._temp_file):
            os.unlink(self._temp_file)
        self._temp_file = None
        self._book = None
        self._chapters = []
        self._current_chapter_title = None
|