import os
import tempfile
from pathlib import Path

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub


class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content.

    The uploaded data is written to a temporary ``.epub`` file, read with
    ``ebooklib`` and turned into a flat list of ``{'title', 'item'}`` chapter
    records. Call :meth:`cleanup` to remove the temporary file and reset
    the parser.
    """

    # Text nodes whose direct parent is one of these tags are left out of
    # the extracted chapter text.
    _SKIPPED_TEXT_PARENTS = frozenset({'script', 'style', 'nav', 'header'})

    def __init__(self):
        self._temp_file = None
        self._book = None
        self._chapters = []
        # Set by get_chapter_content(); initialised here so the attribute
        # always exists.
        self._current_chapter_title = None

    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and extract chapter titles.

        Any previously loaded document is discarded first.

        Args:
            file_data: File data from Gradio (FileData object with read()
                method). An object without ``read`` is written to the
                temporary file as-is, so it must be bytes-like.
            filename: Optional filename (not used).

        Returns:
            The chapter titles, in the order the chapters were discovered.

        Raises:
            ValueError: If the EPUB cannot be read. The temporary file is
                removed and the parser is reset before raising.
        """
        self.cleanup()

        content = file_data.read() if hasattr(file_data, 'read') else file_data

        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            # Record the path before writing so cleanup() can still remove
            # the file if the write itself fails.
            self._temp_file = temp.name
            temp.write(content)

        try:
            self._book = epub.read_epub(self._temp_file)
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {str(e)}")
            # Do not leave the temp file or a half-initialised parser behind.
            self.cleanup()
            raise ValueError(f"Failed to read EPUB: {str(e)}") from e
        print("DEBUG: Successfully read EPUB file")

        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        return [chapter['title'] for chapter in self._chapters]

    def get_chapter_content(self, chapter_idx: int) -> str:
        """Get the content of a specific chapter.

        Args:
            chapter_idx: Zero-based index into the list returned by
                :meth:`load_document`.

        Returns:
            The chapter's text, with text blocks separated by blank lines.

        Raises:
            ValueError: If no document is loaded or the index is out of range.
        """
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")

        if not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        self._current_chapter_title = chapter['title'].strip()

        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")
        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")

        return content

    def _extract_chapters(self) -> list[dict]:
        """Extract chapters from the EPUB file.

        Three strategies are tried in order until one yields chapters:
        the table of contents, headings found inside the documents, and
        finally one numbered chapter per document.

        Returns:
            A list of ``{'title': str, 'item': document}`` dicts.
        """
        chapters = []

        print("DEBUG: Checking table of contents...")
        if hasattr(self._book, 'toc'):
            self._print_toc_structure()
            chapters = self._chapters_from_toc(self._book.toc)
            print(f"DEBUG: Found {len(chapters)} chapters in TOC")
            print("DEBUG: Chapter titles found:")
            for ch in chapters:
                print(f" - {ch['title']}")

        if not chapters:
            print("DEBUG: No chapters in TOC, scanning documents...")
            chapters = self._chapters_from_headings()

        if not chapters:
            print("DEBUG: No chapters found, using documents as chapters")
            chapters = [
                {'title': f"Chapter {number}", 'item': doc}
                for number, doc in enumerate(self._document_items(), start=1)
            ]

        return chapters

    def _document_items(self) -> list:
        """Return the book's document items in book order."""
        # The ITEM_* constants live on the top-level ``ebooklib`` package,
        # not on ``ebooklib.epub``.
        return list(self._book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

    def _print_toc_structure(self) -> None:
        """Print a debug summary of the top-level TOC entries."""
        print("DEBUG: TOC structure:")
        for item in self._book.toc:
            print(f"DEBUG: TOC item type: {type(item)}")
            if not isinstance(item, tuple):
                continue
            print(f"DEBUG: Tuple length: {len(item)}")
            if len(item) > 1:
                print(f"DEBUG: Second item type: {type(item[1])}")
                if isinstance(item[1], (list, tuple)):
                    print(f"DEBUG: Sub-items count: {len(item[1])}")

    def _find_document(self, href):
        """Return the book item a TOC href points at, or None."""
        if not href:
            return None
        doc = self._book.get_item_with_href(href)
        if not doc and '#' in href:
            # TOC entries often address an anchor ("ch1.xhtml#sec2"); retry
            # with the fragment stripped so the entry resolves to its file.
            doc = self._book.get_item_with_href(href.split('#', 1)[0])
        return doc

    def _chapters_from_toc(self, entries, level=0) -> list[dict]:
        """Flatten (possibly nested) TOC entries into chapter records.

        An entry is either an object with ``title`` and ``href`` or a tuple
        whose first element is such an object and whose optional second
        element holds the child entries. Titles of nested entries are
        prefixed with one space per nesting level.
        """
        prefix = " " * level if level > 0 else ""
        chapters = []

        for entry in entries:
            children = None
            if hasattr(entry, 'title') and hasattr(entry, 'href'):
                node = entry
            elif isinstance(entry, tuple) and entry:
                node = entry[0]
                if len(entry) > 1:
                    children = entry[1]
            else:
                continue

            if hasattr(node, 'title') and hasattr(node, 'href'):
                doc = self._find_document(node.href)
                if doc:
                    chapters.append({'title': prefix + node.title, 'item': doc})

            if isinstance(children, (list, tuple)):
                chapters.extend(self._chapters_from_toc(children, level + 1))
            elif hasattr(children, 'title'):
                chapters.extend(self._chapters_from_toc([children], level + 1))

        return chapters

    def _chapters_from_headings(self) -> list[dict]:
        """Build chapter records from headings found inside the documents.

        A heading is any ``h1``/``h2`` element, or any element with a class
        containing "chapter" or "title" (case-insensitive).
        """
        docs = self._document_items()
        print(f"DEBUG: Found {len(docs)} documents to scan")

        chapters = []
        for doc in docs:
            soup = BeautifulSoup(doc.get_content(), 'html.parser')

            headings = (
                soup.find_all(['h1', 'h2']) +
                soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
            )

            # An element such as <h1 class="chapter"> matches both queries;
            # track identities so it is listed only once.
            seen = set()
            for heading in headings:
                if id(heading) in seen:
                    continue
                seen.add(id(heading))

                # Collapse all internal whitespace runs to single spaces.
                title = ' '.join(heading.get_text().split())
                if title:
                    chapters.append({'title': title, 'item': doc})

        return chapters

    def _get_chapter_text(self, item) -> str:
        """Extract text content from a chapter.

        Script and style elements are removed, then every non-empty text
        node under ``<body>`` (or ``<main>``, or the whole document) is
        collected, skipping nodes whose direct parent is a script, style,
        nav or header tag. If that extraction fails, the document's plain
        text is returned instead.
        """
        # Parse outside the try block: the fallback below needs ``soup``,
        # and a parse failure should surface as the real error.
        soup = BeautifulSoup(item.get_content(), 'html.parser')

        try:
            for element in soup(['script', 'style']):
                element.decompose()

            content_area = soup.find('body') or soup.find('main') or soup

            text_blocks = [
                node.strip()
                for node in content_area.find_all(string=True)
                if node.parent.name not in self._SKIPPED_TEXT_PARENTS and node.strip()
            ]
            return '\n\n'.join(text_blocks)

        except Exception as e:
            print(f"DEBUG: Error extracting text: {str(e)}")
            return soup.get_text(separator='\n\n', strip=True)

    def cleanup(self):
        """Clean up temporary files and reset the parser state."""
        temp_path = self._temp_file

        # Reset state first so the parser is consistent even if the file
        # removal below raises.
        self._temp_file = None
        self._book = None
        self._chapters = []
        self._current_chapter_title = None

        if temp_path:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass