dwarkesh committed on
Commit
d3c00bf
·
1 Parent(s): f8fd6b5

I can see chapter titles in the e-pub but I get error on the commentary

Browse files
README.md CHANGED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Run the reader app
2
+ python apps/reader.py
3
+
4
+ # Run the producer app
5
+ python apps/producer.py
6
+
7
+ # Run a script
8
+ python scripts/transcript.py audio_file.mp3
apps/reader.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+
4
# Put the repository root (the directory above this file's folder) on the
# import path so that the `utils` package resolves when this file is run
# directly as a script.
_this_file = Path(__file__)
project_root = str(_this_file.parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)
8
+
9
+ import gradio as gr
10
+ import asyncio
11
+ import os
12
+ import json
13
+ import requests
14
+ from anthropic import Anthropic
15
+ from utils.document_parser import DocumentParser
16
+ from dotenv import load_dotenv
17
+
18
# Read API keys and other settings from the `.env` file that sits in the
# repository root.
env_path = Path(project_root).joinpath(".env")
load_dotenv(env_path)
21
+
22
# Mochi deck IDs, keyed by the category label shown in the UI. The same
# labels are listed in prompts/card_generation.txt, so generated cards are
# expected to carry one of these keys as their "category".
DECK_CATEGORIES = {
    "CS/Hardware": "rhGqR9SK",
    "Math/Physics": "Dm5vczZg",
    "AI": "SS9QEfiy",
    "History/Military": "3nJYp7Zh",
    "Quotes/Random": "rWUzSu8t",
    "Bio": "BspzxaUJ",
    "Econ/Finance": "mvvJ27Q1",
}
32
+
33
class CardGenerator:
    """Handles card generation and Mochi integration.

    An instance owns the document parser, the Claude client and the state of
    the current review session: the cards generated for the most recently
    processed chapter, the index of the card under review, and the cards the
    user has accepted so far.
    """

    # Seconds to wait on a single Mochi request before treating it as failed.
    MOCHI_TIMEOUT = 30

    def __init__(self):
        self.parser = DocumentParser()
        self.claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        self.mochi_key = os.getenv("MOCHI_API_KEY")

        # Resolve the prompt files against the directory above this file's
        # folder instead of the current working directory, so the app also
        # starts when launched from somewhere other than the repository root.
        prompt_dir = Path(__file__).resolve().parent.parent / "prompts"
        self.prompts = {
            key: (prompt_dir / f"{key}.txt").read_text(encoding="utf-8")
            for key in ["card_generation", "commentary"]
        }

        # State
        self.chapter_titles = []  # titles returned by the last successful load
        self.current_cards = []
        self.current_index = 0
        self.approved_cards = []

    def get_chapter_list(self, file_data) -> list[str]:
        """Load the uploaded document and return its chapter titles.

        Args:
            file_data: File data from Gradio.

        Returns:
            The chapter titles, an empty list when no file was given, or a
            single-element list holding an "Error: ..." message when loading
            failed (the message is shown to the user as the only choice).
        """
        self.chapter_titles = []
        try:
            if not file_data:
                return []

            # Attempt to extract filename from file_data
            filename = getattr(file_data, 'name', None)
            if not filename:
                filename = "uploaded_file"
                print("DEBUG: No filename attribute found, using default.")
            else:
                print(f"DEBUG: Filename extracted: {filename}")

            # Check file extension
            file_ext = Path(filename).suffix.lower()
            if not file_ext:
                print("DEBUG: No file extension found, checking content type.")
                # Sniff the leading magic bytes of the raw upload.
                if file_data.startswith(b'%PDF-'):
                    file_ext = '.pdf'
                elif file_data.startswith(b'PK'):
                    file_ext = '.epub'
                else:
                    raise ValueError("Unsupported file type")
            print(f"DEBUG: File extension: {file_ext}")

            titles = self.parser.load_document(file_data, filename)
            # Remember the titles so a title selected in the UI can be mapped
            # back to its position in process_chapter().
            self.chapter_titles = list(titles)
            return titles
        except Exception as e:
            return [f"Error: {str(e)}"]

    async def process_chapter(self, file_data, chapter_idx) -> tuple:
        """Process a chapter and generate cards + commentary.

        The parser is intentionally left loaded afterwards so that further
        chapters of the same document can be processed; load_document()
        resets the parser itself when the next file is uploaded.

        Args:
            file_data: File data from Gradio.
            chapter_idx: Index of the chapter to process, or the chapter title
                exactly as returned by get_chapter_list().

        Returns:
            ``(card, commentary)`` where ``card`` is the UI dict for the first
            card, or ``(None, message)`` when something went wrong.
        """
        try:
            if not file_data:
                return None, "No file provided"

            # Get chapter content
            index = self._resolve_chapter_index(chapter_idx)
            content = self.parser.get_chapter_content(index)

            # Generate cards and commentary concurrently
            cards, commentary = await asyncio.gather(
                self._generate_cards(content),
                self._generate_commentary(content)
            )

            # Parse and store cards
            self.current_cards = self._parse_cards(cards)
            self.current_index = 0
            self.approved_cards = []

            # Return first card and commentary
            return self._get_current_card(), commentary

        except Exception as e:
            return None, f"Error: {str(e)}"

    def _resolve_chapter_index(self, chapter) -> int:
        """Map a chapter index or chapter title onto an integer index.

        Raises:
            ValueError: If nothing is selected or the title is unknown.
        """
        if isinstance(chapter, int):
            return chapter
        if chapter is None:
            raise ValueError("No chapter selected")
        try:
            # With duplicate titles the first match wins.
            return self.chapter_titles.index(chapter)
        except ValueError:
            raise ValueError(f"Unknown chapter: {chapter}") from None

    @staticmethod
    def _parse_cards(raw: str) -> list:
        """Parse the model reply into a list of card dicts.

        The reply is parsed as JSON directly; if that fails, the text between
        the first ``[`` and the last ``]`` is parsed instead so that a reply
        wrapped in prose or a code fence is still accepted.

        Raises:
            ValueError: If no JSON array can be obtained from the reply.
        """
        try:
            cards = json.loads(raw)
        except json.JSONDecodeError:
            start, end = raw.find("["), raw.rfind("]")
            if start == -1 or end < start:
                raise ValueError(
                    "Card generation response did not contain a JSON array"
                ) from None
            cards = json.loads(raw[start:end + 1])
        if not isinstance(cards, list):
            raise ValueError("Card generation response was not a JSON array")
        return cards

    async def _ask_claude(self, system_prompt: str, content: str) -> str:
        """Send ``content`` to Claude under ``system_prompt`` and return the reply text.

        ``self.claude`` is the synchronous client, so its blocking call is run
        in a worker thread; awaiting the call directly would fail because the
        returned message object is not awaitable.
        """
        response = await asyncio.to_thread(
            self.claude.messages.create,
            model="claude-3-opus-20240229",
            max_tokens=4000,
            system=system_prompt,
            messages=[{"role": "user", "content": content}],
        )
        return response.content[0].text

    async def _generate_cards(self, content: str) -> str:
        """Generate flashcards using Claude."""
        return await self._ask_claude(self.prompts["card_generation"], content)

    async def _generate_commentary(self, content: str) -> str:
        """Generate commentary using Claude."""
        return await self._ask_claude(self.prompts["commentary"], content)

    def _get_current_card(self) -> dict:
        """Get the current card together with the UI state that goes with it.

        Returns an empty card with the upload button enabled once every card
        has been reviewed (or when there are no cards at all).
        """
        if not self.current_cards or self.current_index >= len(self.current_cards):
            return {
                'front': "",
                'back': "",
                'category': "",
                'status': "No more cards to review",
                'show_buttons': False,
                'show_upload': True
            }

        card = self.current_cards[self.current_index]
        # Missing keys become empty strings so one malformed card does not
        # abort the whole review session.
        return {
            'front': card.get('front', ""),
            'back': card.get('back', ""),
            'category': card.get('category', ""),
            'status': f"Card {self.current_index + 1} of {len(self.current_cards)}",
            'show_buttons': True,
            'show_upload': False
        }

    def accept_card(self, front: str, back: str, category: str) -> dict:
        """Accept the current card (as edited in the UI) and move to the next."""
        if self.current_index < len(self.current_cards):
            self.approved_cards.append({
                'front': front,
                'back': back,
                'category': category
            })

        self.current_index += 1
        return self._get_current_card()

    def reject_card(self) -> dict:
        """Reject the current card and move to the next.

        The card is removed from the list, so the index already points at the
        following card and is not advanced.
        """
        if self.current_index < len(self.current_cards):
            self.current_cards.pop(self.current_index)
        return self._get_current_card()

    def upload_to_mochi(self) -> str:
        """Upload approved cards to Mochi and return a human-readable summary.

        The approved list is cleared afterwards regardless of the outcome.
        """
        if not self.approved_cards:
            return "No cards to upload!"

        results = []
        for card in self.approved_cards:
            try:
                category = card["category"]
                if category not in DECK_CATEGORIES:
                    raise ValueError(f"Unknown deck category: {category!r}")

                # Format card for Mochi.
                # NOTE(review): `content` with the two sides separated by
                # `---` is added on the understanding that Mochi's create-card
                # endpoint requires a markdown `content` field - confirm
                # against the Mochi API reference.
                mochi_card = {
                    "content": f"{card['front']}\n---\n{card['back']}",
                    "deck-id": DECK_CATEGORIES[category],
                    "fields": {
                        "name": {"id": "name", "value": card["front"]},
                        "back": {"id": "back", "value": card["back"]}
                    }
                }

                # Upload to Mochi; the timeout keeps a stalled connection from
                # hanging the UI forever.
                response = requests.post(
                    "https://app.mochi.cards/api/cards",
                    json=mochi_card,
                    auth=(self.mochi_key, ""),
                    timeout=self.MOCHI_TIMEOUT
                )

                if response.status_code != 200:
                    results.append(f"Error: {response.text}")

            except Exception as e:
                results.append(f"Error: {str(e)}")

        # Clear approved cards
        success_count = len(self.approved_cards) - len(results)
        self.approved_cards = []

        if results:
            return f"Uploaded {success_count} cards with {len(results)} errors:\n" + "\n".join(results)
        return f"Successfully uploaded {success_count} cards to Mochi!"
217
+
218
def create_interface():
    """Create the Gradio interface.

    Builds a Blocks app with an EPUB upload, a chapter dropdown, a read-only
    commentary box, an editable card (front / back / deck category) with
    accept and reject buttons, and an upload-to-Mochi button. All handlers
    share one CardGenerator instance.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    generator = CardGenerator()

    # Titles currently offered by the chapter dropdown. The dropdown hands the
    # selected *title* to the handler, while the generator expects an index.
    chapter_titles = []

    def visibility(flag):
        """Return an update that only toggles a component's visibility."""
        return gr.update(visible=flag)

    def category_update(card):
        """Select the card's category in the deck dropdown when it is a known deck.

        Unknown or empty categories leave the dropdown untouched rather than
        pushing a value that is not among its choices.
        """
        category = card['category']
        if category in DECK_CATEGORIES:
            return gr.update(value=category)
        return gr.update()

    with gr.Blocks(title="Document Reader & Card Generator") as app:
        # Document upload and chapter selection
        with gr.Row():
            file_input = gr.File(
                label="Upload EPUB Document",
                type="binary",
                file_types=[".epub"]
            )

            chapter_select = gr.Dropdown(
                label="Select Chapter",
                choices=[],
                interactive=True,
                visible=False
            )

        def update_chapters(file):
            """Reload the chapter dropdown for a newly uploaded (or cleared) file."""
            chapter_titles.clear()
            if not file:
                return gr.update(choices=[], value=None, visible=False)
            chapters = generator.get_chapter_list(file)
            chapter_titles.extend(chapters)
            return gr.update(choices=chapters, visible=True, value=chapters[0] if chapters else None)

        file_input.change(
            fn=update_chapters,
            inputs=[file_input],
            outputs=[chapter_select]
        )

        process_btn = gr.Button("Process Chapter")

        # Commentary section
        commentary = gr.Textbox(
            label="Commentary",
            lines=10,
            interactive=False
        )

        # Card review section
        gr.Markdown("## Review Cards")

        with gr.Row():
            card_front = gr.Textbox(
                label="Front",
                lines=3,
                interactive=True
            )
            card_back = gr.Textbox(
                label="Back",
                lines=3,
                interactive=True
            )

        with gr.Row():
            deck_category = gr.Dropdown(
                choices=list(DECK_CATEGORIES.keys()),
                label="Deck Category",
                value="AI"
            )
            card_status = gr.Textbox(
                label="Status",
                interactive=False
            )

        with gr.Row():
            accept_btn = gr.Button("Accept & Next", visible=False)
            reject_btn = gr.Button("Reject & Next", visible=False)
            upload_btn = gr.Button("Upload to Mochi", visible=False)

        upload_status = gr.Textbox(
            label="Upload Status",
            interactive=False
        )

        # Event handlers
        async def process_chapter(file, chapter_idx):
            """Generate cards and commentary for the selected chapter and show the first card."""
            # Translate the selected title into the index the generator expects.
            if isinstance(chapter_idx, str) and chapter_idx in chapter_titles:
                chapter_idx = chapter_titles.index(chapter_idx)

            card, comment = await generator.process_chapter(file, chapter_idx)
            if not card:  # Error occurred; `comment` carries the message
                return [
                    "", "", comment, visibility(False),
                    visibility(False), "", visibility(False), gr.update()
                ]

            return [
                card['front'],
                card['back'],
                comment,
                visibility(card['show_buttons']),
                visibility(card['show_buttons']),
                card['status'],
                visibility(card['show_upload']),
                # Show the generated category; otherwise every accepted card
                # would silently keep the dropdown's previous value.
                category_update(card)
            ]

        def handle_card_action(action, front, back, category):
            """Apply an accept/reject decision and return the updates for the next card."""
            card = (generator.accept_card(front, back, category)
                    if action == 'accept' else
                    generator.reject_card())

            return [
                card['front'],
                card['back'],
                card['status'],
                visibility(card['show_buttons']),
                visibility(card['show_buttons']),
                category_update(card),
                visibility(card['show_upload'])
            ]

        # Connect events
        process_btn.click(
            fn=process_chapter,
            inputs=[file_input, chapter_select],
            outputs=[
                card_front, card_back, commentary,
                accept_btn, reject_btn, card_status, upload_btn,
                deck_category
            ]
        )

        accept_btn.click(
            fn=lambda f, b, c: handle_card_action('accept', f, b, c),
            inputs=[card_front, card_back, deck_category],
            outputs=[
                card_front, card_back, card_status,
                accept_btn, reject_btn, deck_category, upload_btn
            ]
        )

        reject_btn.click(
            fn=lambda: handle_card_action('reject', None, None, None),
            outputs=[
                card_front, card_back, card_status,
                accept_btn, reject_btn, deck_category, upload_btn
            ]
        )

        upload_btn.click(
            fn=generator.upload_to_mochi,
            outputs=[upload_status]
        )

    return app
362
+
363
# Script entry point: build the interface and start the Gradio server.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
prompts/card_generation.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert at creating high-quality spaced repetition flashcards that promote deep understanding and retention. Your task is to generate flashcards from the given text that are:
2
+
3
+ 1. Clear and concise
4
+ 2. Focus on one concept per card
5
+ 3. Test understanding rather than just recall
6
+ 4. Avoid overly complex or compound questions
7
+ 5. Use precise language
8
+
9
+ Each card must be assigned to one of these categories:
10
+ - CS/Hardware
11
+ - Math/Physics
12
+ - AI
13
+ - History/Military
14
+ - Quotes/Random
15
+ - Bio
16
+ - Econ/Finance
17
+
18
+ Format each card as a JSON object:
19
+ {
20
+ "category": "Category name from the list above",
21
+ "front": "Question or prompt",
22
+ "back": "Answer or explanation"
23
+ }
24
+
25
+ Example cards:
26
+ {
27
+ "category": "Bio",
28
+ "front": "What is the key difference between procedural and declarative memory?",
29
+ "back": "Procedural memory is for skills and procedures (how to ride a bike), while declarative memory is for facts and events (what you had for breakfast)."
30
+ }
31
+
32
+ {
33
+ "category": "Bio",
34
+ "front": "What role does the hippocampus play in memory formation?",
35
+ "back": "The hippocampus is crucial for converting short-term memories into long-term memories through a process called consolidation. It acts as a temporary storage and processing center before memories are distributed to other parts of the cortex."
36
+ }
37
+
38
+ Please generate 5-10 high-quality flashcards from the provided text. Focus on the most important concepts, insights, and relationships. Format the output as a single JSON array containing the card objects, and return only that array, with no surrounding commentary or Markdown code fences.
prompts/commentary.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert researcher and critical thinker. Your task is to analyze the provided text and generate insightful commentary that:
2
+
3
+ 1. Identifies the key arguments, insights, and novel ideas
4
+ 2. Highlights connections to other important concepts or fields
5
+ 3. Points out particularly interesting or counterintuitive points
6
+ 4. Suggests areas that merit further exploration
7
+ 5. Notes any potential weaknesses or areas of uncertainty in the arguments
8
+
9
+ Your commentary should be scholarly but engaging, helping the reader develop a deeper understanding of the material. Focus on substance over style, and be specific rather than general.
10
+
11
+ Structure your response as follows:
12
+
13
+ Key Insights:
14
+ - [2-3 bullet points highlighting the most important takeaways]
15
+
16
+ Interesting Connections:
17
+ - [2-3 bullet points noting connections to other fields/concepts]
18
+
19
+ Worth Exploring Further:
20
+ - [1-2 bullet points suggesting related areas for deeper investigation]
21
+
22
+ Critical Notes:
23
+ - [1-2 bullet points on potential weaknesses or areas needing clarification]
24
+
25
+ Then provide 2-3 paragraphs of integrated analysis that weaves these points together into a coherent commentary.
requirements.txt CHANGED
@@ -6,4 +6,8 @@ pandas
6
  youtube-transcript-api
7
  pydub
8
  assemblyai
9
- pytube
 
 
 
 
 
6
  youtube-transcript-api
7
  pydub
8
  assemblyai
9
+ pytube
10
+ PyPDF2
11
+ EbookLib
12
+ beautifulsoup4
13
+ python-dotenv
utils/document_parser.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import tempfile
3
+ import os
4
+ from ebooklib import epub
5
+ from bs4 import BeautifulSoup
6
+
7
class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content.

    Typical use: call load_document() once per uploaded file, then
    get_chapter_content() for any of the returned chapters, and cleanup()
    when the document is no longer needed.
    """

    def __init__(self):
        self._temp_file = None  # path of the on-disk copy of the upload
        self._book = None  # object returned by epub.read_epub
        self._chapters = []  # dicts with 'title' (str) and 'item' (document)
        self._current_chapter_title = None  # last requested title (debug only)

    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and extract chapter titles.

        Any previously loaded document is discarded first.

        Args:
            file_data: File data from Gradio; either raw bytes or an object
                with a ``read()`` method.
            filename: Optional filename (not used).

        Returns:
            The chapter titles, in the order used by get_chapter_content().

        Raises:
            ValueError: If the data cannot be read as an EPUB.
        """
        # Clean up any previous temp file
        self.cleanup()

        # Get the raw bytes from the Gradio file data
        content = file_data.read() if hasattr(file_data, 'read') else file_data

        # The data is written to a temp file because read_epub is given a
        # path here.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            temp.write(content)
            self._temp_file = temp.name

        # Read the EPUB
        try:
            self._book = epub.read_epub(self._temp_file)
            print("DEBUG: Successfully read EPUB file")
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {str(e)}")
            # Do not leave the temp file behind when loading fails.
            self.cleanup()
            raise ValueError(f"Failed to read EPUB: {str(e)}") from e

        # Extract chapters
        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        # Return chapter titles
        return [chapter['title'] for chapter in self._chapters]

    def get_chapter_content(self, chapter_idx: int) -> str:
        """Get the text content of a specific chapter.

        Args:
            chapter_idx: Zero-based position in the list returned by
                load_document().

        Raises:
            ValueError: If no document is loaded or the index is not a valid
                integer position.
        """
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")

        if not isinstance(chapter_idx, int) or not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        # Kept only for the debug output below.
        self._current_chapter_title = chapter['title'].strip()

        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")
        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")

        return content

    def _extract_chapters(self) -> list[dict]:
        """Extract chapters from the EPUB file.

        Three strategies are tried in order and the first one that yields
        anything wins: the table of contents, headings found inside the
        documents, and finally one chapter per document.
        """
        chapters = []

        # First try to get chapters from the table of contents
        print("DEBUG: Checking table of contents...")
        if hasattr(self._book, 'toc'):
            toc = self._book.toc or []

            # Debug the TOC structure
            print("DEBUG: TOC structure:")
            for item in toc:
                print(f"DEBUG: TOC item type: {type(item)}")
                if isinstance(item, tuple):
                    print(f"DEBUG: Tuple length: {len(item)}")
                    if len(item) > 1:
                        print(f"DEBUG: Second item type: {type(item[1])}")
                        if isinstance(item[1], (list, tuple)):
                            print(f"DEBUG: Sub-items count: {len(item[1])}")

            self._collect_toc_entries(toc, chapters)
            print(f"DEBUG: Found {len(chapters)} chapters in TOC")
            print("DEBUG: Chapter titles found:")
            for ch in chapters:
                print(f" - {ch['title']}")

        # If no chapters found in TOC, scan the documents
        if not chapters:
            chapters = self._chapters_from_headings()

        if not chapters:
            chapters = self._chapters_from_documents()

        return chapters

    def _collect_toc_entries(self, entries, chapters, level=0):
        """Append a chapter for every resolvable TOC entry, recursing into sub-entries."""
        for item in entries:
            # Handle both link-like objects and (section, children) tuples
            if hasattr(item, 'title') and hasattr(item, 'href'):
                self._add_toc_chapter(item, chapters, level)
            elif isinstance(item, tuple) and item:
                self._add_toc_chapter(item[0], chapters, level)

                # Process sub-items if they exist
                if len(item) > 1:
                    children = item[1]
                    if isinstance(children, (list, tuple)):
                        self._collect_toc_entries(children, chapters, level + 1)
                    elif hasattr(children, 'title'):  # Single sub-item
                        self._collect_toc_entries([children], chapters, level + 1)

    def _add_toc_chapter(self, entry, chapters, level):
        """Append one TOC entry as a chapter if its href resolves to a document."""
        if not (hasattr(entry, 'title') and hasattr(entry, 'href')):
            return

        # The lookup is done on the file part only: an href such as
        # "ch1.xhtml#sec2" is reduced to "ch1.xhtml" so entries that point at
        # an anchor are not dropped. Several entries may therefore share the
        # same document.
        href = (entry.href or '').split('#', 1)[0]
        doc = self._book.get_item_with_href(href)
        if doc:
            # Indent nested entries so the hierarchy stays visible in a flat list.
            chapters.append({
                'title': " " * level + entry.title,
                'item': doc
            })

    def _chapters_from_headings(self) -> list[dict]:
        """Build chapters from headings found inside the book's documents."""
        print("DEBUG: No chapters in TOC, scanning documents...")
        # Get all HTML documents
        docs = [item for item in self._book.get_items()
                if item.get_type() == epub.ITEM_DOCUMENT]

        print(f"DEBUG: Found {len(docs)} documents to scan")

        chapters = []
        for doc in docs:
            soup = BeautifulSoup(doc.get_content(), 'html.parser')

            # Look for chapter headings
            headings = (
                soup.find_all(['h1', 'h2']) +
                soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
            )

            seen = set()
            for heading in headings:
                # An element can match both searches (e.g. an <h1> with a
                # "chapter" class); list it once.
                if id(heading) in seen:
                    continue
                seen.add(id(heading))

                # Clean up the text
                title = ' '.join(heading.get_text().split())
                if title:  # Only add if we have a title
                    chapters.append({
                        'title': title,
                        'item': doc
                    })
        return chapters

    def _chapters_from_documents(self) -> list[dict]:
        """Treat every document in the book as a chapter named "Chapter N"."""
        print("DEBUG: No chapters found, using documents as chapters")
        chapters = []
        for doc in self._book.get_items():
            if doc.get_type() == epub.ITEM_DOCUMENT:
                chapters.append({
                    'title': f"Chapter {len(chapters) + 1}",
                    'item': doc
                })
        return chapters

    def _get_chapter_text(self, item) -> str:
        """Extract text content from a chapter document.

        Text nodes are stripped and joined with blank lines; nodes whose
        direct parent is a script, style, nav or header element are skipped.
        """
        # Parse outside the try block so the fallback below can rely on
        # `soup` being defined.
        soup = BeautifulSoup(item.get_content(), 'html.parser')
        try:
            # Remove script and style elements
            for element in soup(['script', 'style']):
                element.decompose()

            # Get main content area (usually in body or main tags)
            content_area = soup.find('body') or soup.find('main') or soup

            # Get all text blocks, excluding navigation elements
            text_blocks = []
            for element in content_area.find_all(string=True, recursive=True):
                if (element.parent.name not in ['script', 'style', 'nav', 'header'] and
                        element.strip()):
                    text_blocks.append(element.strip())

            return '\n\n'.join(text_blocks)

        except Exception as e:
            print(f"DEBUG: Error extracting text: {str(e)}")
            # Fallback to simple text extraction
            return soup.get_text(separator='\n\n', strip=True)

    def cleanup(self):
        """Delete the temporary file and forget the loaded document."""
        if self._temp_file and os.path.exists(self._temp_file):
            os.unlink(self._temp_file)
        self._temp_file = None
        self._book = None
        self._chapters = []