I can see chapter titles in the e-pub but I get error on the commentary
Browse files- README.md +8 -0
- apps/reader.py +364 -0
- prompts/card_generation.txt +38 -0
- prompts/commentary.txt +25 -0
- requirements.txt +5 -1
- utils/document_parser.py +191 -0
README.md
CHANGED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Run the reader app
|
2 |
+
python apps/reader.py
|
3 |
+
|
4 |
+
# Run the producer app
|
5 |
+
python apps/producer.py
|
6 |
+
|
7 |
+
# Run a script
|
8 |
+
python scripts/transcript.py audio_file.mp3
|
apps/reader.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
from pathlib import Path

# Add the project root to the Python path so `utils.*` and the `prompts/`
# directory resolve when the app is launched as `python apps/reader.py`.
project_root = str(Path(__file__).parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

import gradio as gr
import asyncio
import os
import json
import requests
from anthropic import Anthropic
from utils.document_parser import DocumentParser
from dotenv import load_dotenv

# Load environment variables (ANTHROPIC_API_KEY, MOCHI_API_KEY) from the
# project-root .env file.
env_path = Path(project_root) / ".env"
load_dotenv(env_path)

# Mochi deck IDs keyed by the human-readable category names that the card
# generation prompt uses; `upload_to_mochi` routes each approved card to the
# matching deck.  NOTE(review): these IDs are account-specific — confirm they
# match the target Mochi account.
DECK_CATEGORIES = {
    "CS/Hardware": "rhGqR9SK",
    "Math/Physics": "Dm5vczZg",
    "AI": "SS9QEfiy",
    "History/Military": "3nJYp7Zh",
    "Quotes/Random": "rWUzSu8t",
    "Bio": "BspzxaUJ",
    "Econ/Finance": "mvvJ27Q1"
}
|
32 |
+
|
33 |
+
class CardGenerator:
    """Handles card generation and Mochi integration.

    Holds the review-session state (pending cards, cursor, approved cards)
    and wraps the Claude and Mochi API calls.
    """

    def __init__(self):
        self.parser = DocumentParser()
        # BUG FIX: the generation methods `await` this client, so it must be
        # the async variant.  Awaiting the synchronous `Anthropic` client's
        # `messages.create(...)` raises a TypeError, which surfaced in the UI
        # as "Error: ..." in the commentary box.
        from anthropic import AsyncAnthropic
        self.claude = AsyncAnthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        self.mochi_key = os.getenv("MOCHI_API_KEY")

        # Load system prompts relative to the project root so the app works
        # regardless of the current working directory (the original used a
        # CWD-relative "prompts/..." path).
        prompt_dir = Path(__file__).parent.parent / "prompts"
        self.prompts = {
            key: (prompt_dir / f"{key}.txt").read_text()
            for key in ["card_generation", "commentary"]
        }

        # Review-session state.
        self.current_cards = []   # cards awaiting review (list of dicts)
        self.current_index = 0    # index of the card currently on screen
        self.approved_cards = []  # cards accepted for upload to Mochi

    def get_chapter_list(self, file_data) -> list[str]:
        """Return the chapter titles of an uploaded document.

        Args:
            file_data: Raw bytes (or file-like object) from Gradio.

        Returns:
            Chapter titles, or a one-element ``["Error: ..."]`` list on failure.
        """
        try:
            if not file_data:
                return []

            # Attempt to extract a filename from the upload.
            filename = getattr(file_data, 'name', None)
            if not filename:
                filename = "uploaded_file"
                print("DEBUG: No filename attribute found, using default.")
            else:
                # BUG FIX: the original f-string printed the literal text
                # "(unknown)" instead of interpolating the filename.
                print(f"DEBUG: Filename extracted: {filename}")

            # Determine the file type; sniff magic bytes when the name has
            # no extension (EPUBs are ZIP archives, hence the b'PK' header).
            file_ext = Path(filename).suffix.lower()
            if not file_ext:
                print("DEBUG: No file extension found, checking content type.")
                if file_data.startswith(b'%PDF-'):
                    file_ext = '.pdf'
                elif file_data.startswith(b'PK'):
                    file_ext = '.epub'
                else:
                    raise ValueError("Unsupported file type")
            print(f"DEBUG: File extension: {file_ext}")

            return self.parser.load_document(file_data, filename)
        except Exception as e:
            return [f"Error: {str(e)}"]

    async def process_chapter(self, file_data, chapter_idx) -> tuple:
        """Process a chapter and generate cards + commentary.

        Args:
            file_data: Raw bytes from Gradio.
            chapter_idx: Chapter index (int), or the chapter *title* string —
                Gradio dropdowns deliver the selected value by default.

        Returns:
            ``(card_dict, commentary)`` on success,
            ``(None, error_message)`` on failure.
        """
        try:
            if not file_data:
                return None, "No file provided"

            # BUG FIX: reload the document on every call.  The `finally:
            # cleanup()` below clears parser state, so a second click on
            # "Process Chapter" previously failed with "No document loaded".
            titles = self.parser.load_document(file_data)

            # BUG FIX: translate a dropdown title string into an index; the
            # parser API takes an integer.
            if isinstance(chapter_idx, str):
                chapter_idx = titles.index(chapter_idx)

            content = self.parser.get_chapter_content(chapter_idx)

            # Run both Claude calls concurrently.
            cards, commentary = await asyncio.gather(
                self._generate_cards(content),
                self._generate_commentary(content)
            )

            # Claude sometimes wraps the JSON in prose or code fences;
            # extract the outermost array before parsing.
            start, end = cards.find('['), cards.rfind(']')
            if start != -1 and end > start:
                cards = cards[start:end + 1]

            self.current_cards = json.loads(cards)
            self.current_index = 0
            self.approved_cards = []

            # Return the first card and the commentary text.
            return self._get_current_card(), commentary

        except Exception as e:
            return None, f"Error: {str(e)}"
        finally:
            self.parser.cleanup()

    async def _ask_claude(self, prompt_key: str, content: str) -> str:
        """Send `content` to Claude under the named system prompt; return the reply text."""
        response = await self.claude.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=4000,
            system=self.prompts[prompt_key],
            messages=[{"role": "user", "content": content}]
        )
        return response.content[0].text

    async def _generate_cards(self, content: str) -> str:
        """Generate flashcards (JSON array text) using Claude."""
        return await self._ask_claude("card_generation", content)

    async def _generate_commentary(self, content: str) -> str:
        """Generate chapter commentary using Claude."""
        return await self._ask_claude("commentary", content)

    def _get_current_card(self) -> dict:
        """Return the current card plus the UI state flags the handlers need."""
        if not self.current_cards or self.current_index >= len(self.current_cards):
            # Review finished (or nothing parsed): hide the accept/reject
            # buttons and expose the upload button instead.
            return {
                'front': "",
                'back': "",
                'category': "",
                'status': "No more cards to review",
                'show_buttons': False,
                'show_upload': True
            }

        card = self.current_cards[self.current_index]
        return {
            'front': card['front'],
            'back': card['back'],
            'category': card['category'],
            'status': f"Card {self.current_index + 1} of {len(self.current_cards)}",
            'show_buttons': True,
            'show_upload': False
        }

    def accept_card(self, front: str, back: str, category: str) -> dict:
        """Accept the current card (with any user edits) and advance.

        The values come from the UI textboxes so manual edits are kept.
        """
        if self.current_index < len(self.current_cards):
            self.approved_cards.append({
                'front': front,
                'back': back,
                'category': category
            })

        self.current_index += 1
        return self._get_current_card()

    def reject_card(self) -> dict:
        """Discard the current card and show the next one.

        Removing the card in place means `current_index` already points at
        the next card, so the index is deliberately not incremented.
        """
        if self.current_index < len(self.current_cards):
            self.current_cards.pop(self.current_index)
        return self._get_current_card()

    def upload_to_mochi(self) -> str:
        """Upload all approved cards to Mochi and return a status summary."""
        if not self.approved_cards:
            return "No cards to upload!"

        errors = []
        for card in self.approved_cards:
            try:
                # Format the card for the Mochi API.
                mochi_card = {
                    "deck-id": DECK_CATEGORIES[card["category"]],
                    "fields": {
                        "name": {"id": "name", "value": card["front"]},
                        "back": {"id": "back", "value": card["back"]}
                    }
                }

                # Mochi uses HTTP basic auth with the API key as username.
                response = requests.post(
                    "https://app.mochi.cards/api/cards",
                    json=mochi_card,
                    auth=(self.mochi_key, ""),
                    timeout=30  # don't hang the UI on a dead connection
                )

                if response.status_code != 200:
                    errors.append(f"Error: {response.text}")

            except Exception as e:
                errors.append(f"Error: {str(e)}")

        # Clear approved cards regardless of outcome.
        success_count = len(self.approved_cards) - len(errors)
        self.approved_cards = []

        if errors:
            return f"Uploaded {success_count} cards with {len(errors)} errors:\n" + "\n".join(errors)
        return f"Successfully uploaded {success_count} cards to Mochi!"
|
217 |
+
|
218 |
+
def create_interface():
    """Create the Gradio interface.

    Wires an upload box, a chapter dropdown, a commentary pane, and a
    card-review loop (accept / reject / upload-to-Mochi) to one shared
    `CardGenerator` instance.
    """
    generator = CardGenerator()

    with gr.Blocks(title="Document Reader & Card Generator") as app:
        # Document upload and chapter selection.
        with gr.Row():
            file_input = gr.File(
                label="Upload EPUB Document",
                type="binary",
                file_types=[".epub"]
            )

            # BUG FIX: `type="index"` makes the dropdown deliver the selected
            # chapter's integer index, which is what the parser's
            # `get_chapter_content(chapter_idx)` expects.  With the default
            # `type="value"` the handler received the title string and every
            # "Process Chapter" click errored out.
            chapter_select = gr.Dropdown(
                label="Select Chapter",
                choices=[],
                interactive=True,
                visible=False,
                type="index"
            )

        def update_chapters(file):
            """Populate the chapter dropdown when a file is uploaded."""
            if not file:
                return gr.update(choices=[], visible=False)
            chapters = generator.get_chapter_list(file)
            # NOTE(review): on a parse failure `chapters` is a one-element
            # ["Error: ..."] list and is shown as the only choice.
            return gr.update(choices=chapters, visible=True, value=chapters[0] if chapters else None)

        file_input.change(
            fn=update_chapters,
            inputs=[file_input],
            outputs=[chapter_select]
        )

        process_btn = gr.Button("Process Chapter")

        # Commentary section.
        commentary = gr.Textbox(
            label="Commentary",
            lines=10,
            interactive=False
        )

        # Card review section.
        gr.Markdown("## Review Cards")

        with gr.Row():
            card_front = gr.Textbox(
                label="Front",
                lines=3,
                interactive=True
            )
            card_back = gr.Textbox(
                label="Back",
                lines=3,
                interactive=True
            )

        with gr.Row():
            deck_category = gr.Dropdown(
                choices=list(DECK_CATEGORIES.keys()),
                label="Deck Category",
                value="AI"
            )
            card_status = gr.Textbox(
                label="Status",
                interactive=False
            )

        with gr.Row():
            accept_btn = gr.Button("Accept & Next", visible=False)
            reject_btn = gr.Button("Reject & Next", visible=False)
            upload_btn = gr.Button("Upload to Mochi", visible=False)

        upload_status = gr.Textbox(
            label="Upload Status",
            interactive=False
        )

        # Event handlers.
        async def process_chapter(file, chapter_idx):
            """Run generation and fan the result out to the UI components."""
            card, comment = await generator.process_chapter(file, chapter_idx)
            if not card:  # Error occurred; `comment` carries the message.
                return [
                    "", "", comment, gr.update(visible=False),
                    gr.update(visible=False), "", gr.update(visible=False)
                ]

            return [
                card['front'],
                card['back'],
                comment,
                gr.update(visible=card['show_buttons']),
                gr.update(visible=card['show_buttons']),
                card['status'],
                gr.update(visible=card['show_upload'])
            ]

        def handle_card_action(action, front, back, category):
            """Accept or reject the current card and refresh the review UI."""
            card = (generator.accept_card(front, back, category)
                    if action == 'accept' else
                    generator.reject_card())

            return [
                card['front'],
                card['back'],
                card['status'],
                gr.update(visible=card['show_buttons']),
                gr.update(visible=card['show_buttons']),
                card['category'],
                gr.update(visible=card['show_upload'])
            ]

        # Connect events.
        process_btn.click(
            fn=process_chapter,
            inputs=[file_input, chapter_select],
            outputs=[
                card_front, card_back, commentary,
                accept_btn, reject_btn, card_status, upload_btn
            ]
        )

        accept_btn.click(
            fn=lambda f, b, c: handle_card_action('accept', f, b, c),
            inputs=[card_front, card_back, deck_category],
            outputs=[
                card_front, card_back, card_status,
                accept_btn, reject_btn, deck_category, upload_btn
            ]
        )

        reject_btn.click(
            fn=lambda: handle_card_action('reject', None, None, None),
            outputs=[
                card_front, card_back, card_status,
                accept_btn, reject_btn, deck_category, upload_btn
            ]
        )

        upload_btn.click(
            fn=generator.upload_to_mochi,
            outputs=[upload_status]
        )

    return app
|
362 |
+
|
363 |
+
# Entry point: build the Gradio UI and serve it on the default local port.
if __name__ == "__main__":
    create_interface().launch()
|
prompts/card_generation.txt
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You are an expert at creating high-quality spaced repetition flashcards that promote deep understanding and retention. Your task is to generate flashcards from the given text that are:
|
2 |
+
|
3 |
+
1. Clear and concise
2. Focused on one concept per card
3. Designed to test understanding rather than just recall
4. Free of overly complex or compound questions
5. Written in precise language
|
8 |
+
|
9 |
+
Each card must be assigned to one of these categories:
|
10 |
+
- CS/Hardware
|
11 |
+
- Math/Physics
|
12 |
+
- AI
|
13 |
+
- History/Military
|
14 |
+
- Quotes/Random
|
15 |
+
- Bio
|
16 |
+
- Econ/Finance
|
17 |
+
|
18 |
+
Format each card as a JSON object:
|
19 |
+
{
|
20 |
+
"category": "Category name from the list above",
|
21 |
+
"front": "Question or prompt",
|
22 |
+
"back": "Answer or explanation"
|
23 |
+
}
|
24 |
+
|
25 |
+
Example cards:
|
26 |
+
{
|
27 |
+
"category": "Bio",
|
28 |
+
"front": "What is the key difference between procedural and declarative memory?",
|
29 |
+
"back": "Procedural memory is for skills and procedures (how to ride a bike), while declarative memory is for facts and events (what you had for breakfast)."
|
30 |
+
}
|
31 |
+
|
32 |
+
{
|
33 |
+
"category": "Bio",
|
34 |
+
"front": "What role does the hippocampus play in memory formation?",
|
35 |
+
"back": "The hippocampus is crucial for converting short-term memories into long-term memories through a process called consolidation. It acts as a temporary storage and processing center before memories are distributed to other parts of the cortex."
|
36 |
+
}
|
37 |
+
|
38 |
+
Please generate 5-10 high-quality flashcards from the provided text. Focus on the most important concepts, insights, and relationships. Output ONLY a JSON array containing the card objects — no introductory text, no code fences, and no commentary outside the JSON.
|
prompts/commentary.txt
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You are an expert researcher and critical thinker. Your task is to analyze the provided text and generate insightful commentary that:
|
2 |
+
|
3 |
+
1. Identifies the key arguments, insights, and novel ideas
|
4 |
+
2. Highlights connections to other important concepts or fields
|
5 |
+
3. Points out particularly interesting or counterintuitive points
|
6 |
+
4. Suggests areas that merit further exploration
|
7 |
+
5. Notes any potential weaknesses or areas of uncertainty in the arguments
|
8 |
+
|
9 |
+
Your commentary should be scholarly but engaging, helping the reader develop a deeper understanding of the material. Focus on substance over style, and be specific rather than general.
|
10 |
+
|
11 |
+
Structure your response as follows:
|
12 |
+
|
13 |
+
Key Insights:
|
14 |
+
- [2-3 bullet points highlighting the most important takeaways]
|
15 |
+
|
16 |
+
Interesting Connections:
|
17 |
+
- [2-3 bullet points noting connections to other fields/concepts]
|
18 |
+
|
19 |
+
Worth Exploring Further:
|
20 |
+
- [1-2 bullet points suggesting related areas for deeper investigation]
|
21 |
+
|
22 |
+
Critical Notes:
|
23 |
+
- [1-2 bullet points on potential weaknesses or areas needing clarification]
|
24 |
+
|
25 |
+
Then provide 2-3 paragraphs of integrated analysis that weaves these points together into a coherent commentary.
|
requirements.txt
CHANGED
@@ -6,4 +6,8 @@ pandas
|
|
6 |
youtube-transcript-api
|
7 |
pydub
|
8 |
assemblyai
|
9 |
-
pytube
|
|
|
|
|
|
|
|
|
|
6 |
youtube-transcript-api
|
7 |
pydub
|
8 |
assemblyai
|
9 |
+
pytube
|
10 |
+
PyPDF2
|
11 |
+
EbookLib
|
12 |
+
beautifulsoup4
|
13 |
+
python-dotenv
|
utils/document_parser.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import tempfile
from pathlib import Path

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
7 |
+
class DocumentParser:
    """Simple EPUB document parser that extracts chapters and their content."""

    def __init__(self):
        self._temp_file = None             # path of the staged .epub copy on disk
        self._book = None                  # parsed epub.EpubBook, set by load_document
        self._chapters = []                # [{'title': str, 'item': document item}, ...]
        self._current_chapter_title = None # set by get_chapter_content (was previously unset)

    def load_document(self, file_data, filename=None) -> list[str]:
        """Load an EPUB document and return its chapter titles.

        Args:
            file_data: Raw bytes, or a file-like object with a ``read()`` method.
            filename: Optional original filename (unused; EPUB is assumed).

        Raises:
            ValueError: If the data cannot be parsed as an EPUB.
        """
        # Drop any previously loaded document and its temp file.
        self.cleanup()

        # Get the raw bytes from the Gradio file data.
        content = file_data.read() if hasattr(file_data, 'read') else file_data

        # ebooklib reads only from a filesystem path, so stage the bytes.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.epub') as temp:
            temp.write(content)
            self._temp_file = temp.name

        try:
            self._book = epub.read_epub(self._temp_file)
            print("DEBUG: Successfully read EPUB file")
        except Exception as e:
            print(f"DEBUG: Error reading EPUB: {str(e)}")
            raise ValueError(f"Failed to read EPUB: {str(e)}") from e

        self._chapters = self._extract_chapters()
        print(f"DEBUG: Extracted {len(self._chapters)} chapters")

        return [chapter['title'] for chapter in self._chapters]

    def get_chapter_content(self, chapter_idx: int) -> str:
        """Return the plain-text content of the chapter at `chapter_idx`.

        Raises:
            ValueError: If no document is loaded or the index is out of range.
        """
        if not self._book or not self._chapters:
            raise ValueError("No document loaded")

        if not 0 <= chapter_idx < len(self._chapters):
            raise ValueError(f"Invalid chapter index: {chapter_idx}")

        chapter = self._chapters[chapter_idx]
        self._current_chapter_title = chapter['title'].strip()

        print(f"DEBUG: Getting content for chapter: {self._current_chapter_title}")
        content = self._get_chapter_text(chapter['item'])
        print(f"DEBUG: Extracted {len(content)} characters of content")

        return content

    def _extract_chapters(self) -> list[dict]:
        """Build the chapter list, trying three strategies in order:
        table of contents, heading scan, then one chapter per document."""
        chapters = self._chapters_from_toc()

        if not chapters:
            print("DEBUG: No chapters in TOC, scanning documents...")
            chapters = self._chapters_from_headings()

        if not chapters:
            print("DEBUG: No chapters found, using documents as chapters")
            chapters = [
                {'title': f"Chapter {i + 1}", 'item': doc}
                for i, doc in enumerate(self._documents())
            ]

        return chapters

    def _documents(self) -> list:
        """All HTML document items in the book.

        BUG FIX: ``ITEM_DOCUMENT`` is defined at the `ebooklib` package root,
        not on the `epub` submodule; the original ``epub.ITEM_DOCUMENT``
        raised AttributeError whenever this fallback path ran.
        """
        return [item for item in self._book.get_items()
                if item.get_type() == ebooklib.ITEM_DOCUMENT]

    def _chapters_from_toc(self) -> list[dict]:
        """Chapters from the table of contents; nested sections are
        flattened, with sub-entries indented two spaces per level."""
        chapters = []

        def add(entry, level):
            # Accept any Link-like object carrying a title and an href.
            if hasattr(entry, 'title') and hasattr(entry, 'href'):
                doc = self._book.get_item_with_href(entry.href)
                if doc:
                    prefix = "  " * level if level > 0 else ""
                    chapters.append({'title': prefix + entry.title, 'item': doc})

        def walk(entries, level=0):
            for item in entries:
                if isinstance(item, tuple):
                    # (section, children) pairs from nested TOC entries.
                    add(item[0], level)
                    if len(item) > 1:
                        if isinstance(item[1], (list, tuple)):
                            walk(item[1], level + 1)
                        elif hasattr(item[1], 'title'):  # single sub-item
                            walk([item[1]], level + 1)
                else:
                    add(item, level)

        if hasattr(self._book, 'toc'):
            walk(self._book.toc)

        print(f"DEBUG: Found {len(chapters)} chapters in TOC")
        for ch in chapters:
            print(f"  - {ch['title']}")
        return chapters

    def _chapters_from_headings(self) -> list[dict]:
        """Chapters inferred from <h1>/<h2> tags and elements whose CSS class
        mentions 'chapter' or 'title'."""
        chapters = []
        docs = self._documents()
        print(f"DEBUG: Found {len(docs)} documents to scan")

        for doc in docs:
            soup = BeautifulSoup(doc.get_content(), 'html.parser')
            headings = (
                soup.find_all(['h1', 'h2']) +
                soup.find_all(class_=lambda x: x and ('chapter' in x.lower() or 'title' in x.lower()))
            )
            for heading in headings:
                # Collapse internal whitespace in the heading text.
                title = ' '.join(heading.get_text().split())
                if title:
                    chapters.append({'title': title, 'item': doc})

        return chapters

    def _get_chapter_text(self, item) -> str:
        """Extract readable plain text from a chapter document."""
        # BUG FIX: bind `soup` before the try block — the original created it
        # inside the try, so the fallback's `soup.get_text(...)` raised
        # NameError whenever the extraction itself failed early.
        soup = BeautifulSoup(item.get_content(), 'html.parser')
        try:
            # Remove script and style elements.
            for element in soup(['script', 'style']):
                element.decompose()

            # Prefer the main content area when one exists.
            content_area = soup.find('body') or soup.find('main') or soup

            # Collect text blocks, skipping navigation/boilerplate elements.
            # (`string=True` is the non-deprecated spelling of `text=True`.)
            text_blocks = [
                element.strip()
                for element in content_area.find_all(string=True, recursive=True)
                if element.parent.name not in ['script', 'style', 'nav', 'header']
                and element.strip()
            ]
            return '\n\n'.join(text_blocks)

        except Exception as e:
            print(f"DEBUG: Error extracting text: {str(e)}")
            # Fallback to simple whole-document text extraction.
            return soup.get_text(separator='\n\n', strip=True)

    def cleanup(self):
        """Delete the temp file and reset all parser state."""
        if self._temp_file and os.path.exists(self._temp_file):
            os.unlink(self._temp_file)
        self._temp_file = None
        self._book = None
        self._chapters = []
        self._current_chapter_title = None
|