producer/utils/document_parser.py

import os
import tempfile

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup


class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content."""

    def __init__(self):
        self._temp_file = None
        self._book = None
        self._chapters = []
        self._current_chapter_title = None
    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and extract chapter titles.

        Args:
            file_data: File data from Gradio (FileData object with a read()
                method) or raw bytes.
            filename: Optional filename (not used).

        Returns:
            A list of chapter titles, in reading order.
        """
        # Clean up any previous temp file
        self.cleanup()

        # Get the raw bytes from the Gradio file data
        content = file_data.read() if hasattr(file_data, 'read') else file_data

        # Save to a temp file so ebooklib can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            temp.write(content)
            self._temp_file = temp.name

        # Read the EPUB
        try:
            self._book = epub.read_epub(self._temp_file)
            print("DEBUG: Successfully read EPUB file")
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {e}")
            raise ValueError(f"Failed to read EPUB: {e}") from e

        # Extract chapters
        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        # Return chapter titles
        return [chapter['title'] for chapter in self._chapters]
    def get_chapter_content(self, chapter_idx: int) -> str:
        """Get the text content of a specific chapter."""
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")
        if not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        self._current_chapter_title = chapter['title'].strip()  # Remembered for debugging
        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")

        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")
        return content
    def _extract_chapters(self) -> list[dict]:
        """Extract chapters from the EPUB file."""
        chapters = []

        # First try to get chapters from the table of contents
        print("DEBUG: Checking table of contents...")
        if hasattr(self._book, 'toc'):
            # Debug the TOC structure
            print("DEBUG: TOC structure:")
            for item in self._book.toc:
                print(f"DEBUG: TOC item type: {type(item)}")
                if isinstance(item, tuple):
                    print(f"DEBUG: Tuple length: {len(item)}")
                    if len(item) > 1:
                        print(f"DEBUG: Second item type: {type(item[1])}")
                        if isinstance(item[1], (list, tuple)):
                            print(f"DEBUG: Sub-items count: {len(item[1])}")

            def process_toc_entries(entries, level=0):
                for item in entries:
                    # Handle both Link objects and (Section, children) tuples
                    if hasattr(item, 'title') and hasattr(item, 'href'):
                        # Direct Link object; drop any '#fragment' so the
                        # href matches the stored item path
                        doc = self._book.get_item_with_href(item.href.split('#')[0])
                        if doc:
                            prefix = " " * level if level > 0 else ""
                            chapters.append({
                                'title': prefix + item.title,
                                'item': doc
                            })
                    elif isinstance(item, tuple):
                        section = item[0]
                        # Process the section itself
                        if hasattr(section, 'title') and hasattr(section, 'href'):
                            doc = self._book.get_item_with_href(section.href.split('#')[0])
                            if doc:
                                prefix = " " * level if level > 0 else ""
                                chapters.append({
                                    'title': prefix + section.title,
                                    'item': doc
                                })
                        # Process sub-items if they exist
                        if len(item) > 1:
                            if isinstance(item[1], (list, tuple)):
                                process_toc_entries(item[1], level + 1)
                            elif hasattr(item[1], 'title'):  # Single sub-item
                                process_toc_entries([item[1]], level + 1)

            process_toc_entries(self._book.toc)
            print(f"DEBUG: Found {len(chapters)} chapters in TOC")
            print("DEBUG: Chapter titles found:")
            for ch in chapters:
                print(f"  - {ch['title']}")
        # If no chapters were found in the TOC, scan the documents for headings
        if not chapters:
            print("DEBUG: No chapters in TOC, scanning documents...")
            # Get all HTML documents
            docs = [item for item in self._book.get_items()
                    if item.get_type() == ebooklib.ITEM_DOCUMENT]
            print(f"DEBUG: Found {len(docs)} documents to scan")

            for doc in docs:
                soup = BeautifulSoup(doc.get_content(), 'html.parser')
                # Look for chapter headings
                headings = (
                    soup.find_all(['h1', 'h2']) +
                    soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
                )
                for heading in headings:
                    # Collapse whitespace in the heading text
                    title = ' '.join(heading.get_text().split())
                    if title:  # Only add if we have a title
                        chapters.append({
                            'title': title,
                            'item': doc
                        })

        if not chapters:
            print("DEBUG: No chapters found, using documents as chapters")
            # If still no chapters found, treat each document as a chapter
            for doc in self._book.get_items():
                if doc.get_type() == ebooklib.ITEM_DOCUMENT:
                    chapters.append({
                        'title': f"Chapter {len(chapters) + 1}",
                        'item': doc
                    })

        return chapters
    def _get_chapter_text(self, item) -> str:
        """Extract text content from a chapter."""
        try:
            soup = BeautifulSoup(item.get_content(), 'html.parser')
            # Remove script and style elements
            for element in soup(['script', 'style']):
                element.decompose()

            # Get the main content area (usually in body or main tags)
            content_area = soup.find('body') or soup.find('main') or soup

            # Collect all text blocks, excluding navigation elements
            text_blocks = []
            for element in content_area.find_all(string=True):
                if (element.parent.name not in ['script', 'style', 'nav', 'header'] and
                        element.strip()):
                    text_blocks.append(element.strip())
            return '\n\n'.join(text_blocks)
        except Exception as e:
            print(f"DEBUG: Error extracting text: {e}")
            # Fallback to simple text extraction; re-parse here because
            # `soup` may not exist if the error happened during parsing
            try:
                return BeautifulSoup(item.get_content(), 'html.parser').get_text(
                    separator='\n\n', strip=True)
            except Exception:
                return ""
    def cleanup(self):
        """Clean up temporary files."""
        if self._temp_file and os.path.exists(self._temp_file):
            os.unlink(self._temp_file)
        self._temp_file = None
        self._book = None
        self._chapters = []
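

# Minimal usage sketch, assuming a local EPUB at the hypothetical path
# "book.epub". Outside of Gradio this works because load_document accepts
# any object with a read() method (such as an open binary file) or raw bytes.
if __name__ == "__main__":
    parser = DocumentParser()
    try:
        with open("book.epub", "rb") as f:  # placeholder path
            titles = parser.load_document(f)
        for i, title in enumerate(titles):
            print(f"{i}: {title}")
        if titles:
            # Preview the first chapter's extracted text
            print(parser.get_chapter_content(0)[:500])
    finally:
        parser.cleanup()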