File size: 8,125 Bytes
d3c00bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
from pathlib import Path
import tempfile
import os
from ebooklib import epub
from bs4 import BeautifulSoup
class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content.

    Typical use is ``load_document()`` followed by ``get_chapter_content()``
    for each chapter index of interest, and ``cleanup()`` when done. The
    uploaded bytes are spooled to a temporary ``.epub`` file that lives until
    ``cleanup()`` (or the next ``load_document()``) is called.
    """

    # Text nodes whose *direct* parent is one of these tags are left out of
    # the extracted chapter text.
    _SKIPPED_PARENTS = frozenset({'script', 'style', 'nav', 'header'})

    def __init__(self):
        self._temp_file = None  # path of the spooled .epub, if any
        self._book = None  # book object returned by epub.read_epub
        self._chapters = []  # list of {'title': str, 'item': <book item>}
        self._current_chapter_title = None  # title of the last chapter read

    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and extract chapter titles.

        Args:
            file_data: The EPUB data. May be an object with a ``read()``
                method (e.g. Gradio file data), raw bytes, or a filesystem
                path (``str`` / ``os.PathLike``) to an EPUB file.
            filename: Optional filename (not used).

        Returns:
            The chapter titles, in the order used by
            ``get_chapter_content()`` indices.

        Raises:
            ValueError: If the data cannot be read as an EPUB.
        """
        # Clean up any previous temp file and parsed state.
        self.cleanup()

        if hasattr(file_data, 'read'):
            content = file_data.read()
        elif isinstance(file_data, (str, os.PathLike)):
            # A path was passed instead of the bytes themselves.
            content = Path(file_data).read_bytes()
        else:
            content = file_data

        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            # Record the path before writing so that cleanup() can still
            # remove the file if the write itself fails.
            self._temp_file = temp.name
            temp.write(content)

        try:
            self._book = epub.read_epub(self._temp_file)
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {str(e)}")
            # Do not leave the unusable temp file behind.
            self.cleanup()
            raise ValueError(f"Failed to read EPUB: {str(e)}") from e
        print("DEBUG: Successfully read EPUB file")

        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        return [chapter['title'] for chapter in self._chapters]

    def get_chapter_content(self, chapter_idx: int) -> str:
        """Get the content of a specific chapter.

        Args:
            chapter_idx: Zero-based index into the titles returned by
                ``load_document()``.

        Returns:
            The chapter's text, with text blocks separated by blank lines.

        Raises:
            ValueError: If no document is loaded, the index is out of range,
                or the chapter markup cannot be parsed.
        """
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")
        if not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        self._current_chapter_title = chapter['title'].strip()
        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")

        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")
        return content

    def _extract_chapters(self) -> list[dict]:
        """Extract chapters from the EPUB file.

        Three strategies are tried in order, stopping at the first one that
        yields anything: the table of contents, headings found inside the
        documents, and finally one chapter per document.

        Returns:
            A list of ``{'title': str, 'item': <book item>}`` dicts.
        """
        chapters = []

        # First try to get chapters from the table of contents.
        print("DEBUG: Checking table of contents...")
        if hasattr(self._book, 'toc'):
            self._print_toc_structure(self._book.toc)
            self._collect_toc_chapters(self._book.toc, 0, chapters)
            print(f"DEBUG: Found {len(chapters)} chapters in TOC")
            print("DEBUG: Chapter titles found:")
            for ch in chapters:
                print(f" - {ch['title']}")

        # If no chapters found in TOC, scan the documents for headings.
        if not chapters:
            print("DEBUG: No chapters in TOC, scanning documents...")
            chapters = self._scan_document_headings()

        # If still nothing, treat each document as a chapter.
        if not chapters:
            print("DEBUG: No chapters found, using documents as chapters")
            for doc in self._document_items():
                chapters.append({
                    'title': f"Chapter {len(chapters) + 1}",
                    'item': doc
                })

        return chapters

    @staticmethod
    def _print_toc_structure(toc) -> None:
        """Print a debug summary of the top-level TOC entry types."""
        print("DEBUG: TOC structure:")
        for item in toc:
            print(f"DEBUG: TOC item type: {type(item)}")
            if isinstance(item, tuple):
                print(f"DEBUG: Tuple length: {len(item)}")
                if len(item) > 1:
                    print(f"DEBUG: Second item type: {type(item[1])}")
                    if isinstance(item[1], (list, tuple)):
                        print(f"DEBUG: Sub-items count: {len(item[1])}")

    def _resolve_toc_item(self, href):
        """Return the book item a TOC href points at, or None.

        The href is tried verbatim first and then, if it carries a
        ``#fragment`` anchor, again without the fragment, because the lookup
        is done against the href of the whole document.
        """
        if not href:
            return None
        doc = self._book.get_item_with_href(href)
        if not doc and '#' in href:
            doc = self._book.get_item_with_href(href.split('#', 1)[0])
        return doc

    def _collect_toc_chapters(self, entries, level: int, chapters: list) -> None:
        """Append a chapter for every resolvable TOC entry, depth-first.

        Entries are either link-like objects (having ``title`` and ``href``)
        or tuples of ``(section, sub_entries)``. Nested titles are prefixed
        with one space per nesting level. Several entries may resolve to the
        same document when they differ only by fragment.
        """
        for entry in entries:
            if hasattr(entry, 'title') and hasattr(entry, 'href'):
                head, sub_entries = entry, None
            elif isinstance(entry, tuple) and entry:
                head = entry[0]
                sub_entries = entry[1] if len(entry) > 1 else None
            else:
                continue

            if hasattr(head, 'title') and hasattr(head, 'href'):
                doc = self._resolve_toc_item(head.href)
                if doc:
                    prefix = " " * level if level > 0 else ""
                    chapters.append({
                        # Fall back to the href when the entry has no label.
                        'title': prefix + str(head.title or head.href),
                        'item': doc
                    })

            # Process sub-items if they exist.
            if isinstance(sub_entries, (list, tuple)):
                self._collect_toc_chapters(sub_entries, level + 1, chapters)
            elif hasattr(sub_entries, 'title'):  # Single sub-item
                self._collect_toc_chapters([sub_entries], level + 1, chapters)

    def _document_items(self) -> list:
        """Return the book's items whose type is ITEM_DOCUMENT."""
        # The ITEM_* constants are defined on the ebooklib package itself.
        import ebooklib

        return [item for item in self._book.get_items()
                if item.get_type() == ebooklib.ITEM_DOCUMENT]

    def _scan_document_headings(self) -> list[dict]:
        """Build chapters from headings found inside the book's documents.

        A heading is any ``h1``/``h2`` element, or any element with a class
        containing "chapter" or "title" (case-insensitive).
        """
        chapters = []
        docs = self._document_items()
        print(f"DEBUG: Found {len(docs)} documents to scan")

        for doc in docs:
            soup = BeautifulSoup(doc.get_content(), 'html.parser')
            headings = (
                soup.find_all(['h1', 'h2']) +
                soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
            )
            # An element can match both queries; list it only once.
            seen = set()
            for heading in headings:
                if id(heading) in seen:
                    continue
                seen.add(id(heading))
                # Collapse runs of whitespace in the heading text.
                title = ' '.join(heading.get_text().split())
                if title:  # Only add if we have a title
                    chapters.append({
                        'title': title,
                        'item': doc
                    })
        return chapters

    def _get_chapter_text(self, item) -> str:
        """Extract text content from a chapter.

        Args:
            item: A book item whose ``get_content()`` returns the chapter
                markup.

        Returns:
            The chapter's non-empty text blocks joined by blank lines.

        Raises:
            ValueError: If the chapter markup cannot be read or parsed.
        """
        from bs4 import Comment

        try:
            soup = BeautifulSoup(item.get_content(), 'html.parser')
        except Exception as e:
            # Without a parsed tree there is nothing to fall back to.
            print(f"DEBUG: Error extracting text: {str(e)}")
            raise ValueError(f"Failed to parse chapter content: {str(e)}") from e

        try:
            # Remove script and style elements.
            for element in soup(['script', 'style']):
                element.decompose()

            # Get main content area (usually in body or main tags).
            content_area = soup.find('body') or soup.find('main') or soup

            text_blocks = []
            for node in content_area.find_all(string=True):
                if isinstance(node, Comment):
                    continue  # HTML comments are not chapter text
                # NOTE(review): only the direct parent is checked, so text
                # nested deeper inside <nav>/<header> is still included.
                if node.parent.name in self._SKIPPED_PARENTS:
                    continue
                text = node.strip()
                if text:
                    text_blocks.append(text)
            return '\n\n'.join(text_blocks)
        except Exception as e:
            print(f"DEBUG: Error extracting text: {str(e)}")
            # Fallback to simple text extraction.
            return soup.get_text(separator='\n\n', strip=True)

    def cleanup(self):
        """Clean up temporary files and reset the parser state."""
        if self._temp_file:
            try:
                os.unlink(self._temp_file)
            except FileNotFoundError:
                pass  # already removed; nothing to do
        self._temp_file = None
        self._book = None
        self._chapters = []
        self._current_chapter_title = None