it works
scripts/transcript.py (CHANGED: +92 -149)
@@ -4,18 +4,19 @@ from pathlib import Path
 import json
 import hashlib
 import os
-from typing import List, Optional
+from typing import List, Tuple
 import assemblyai as aai
 from google import generativeai
 from pydub import AudioSegment
 import asyncio
 import io
+from multiprocessing import Pool
+from functools import partial
 
 
 @dataclass
 class Utterance:
     """A single utterance from a speaker"""
-
     speaker: str
     text: str
     start: int  # timestamp in ms from AssemblyAI
@@ -24,11 +25,10 @@ class Utterance:
     @property
     def timestamp(self) -> str:
         """Format start time as HH:MM:SS"""
-        seconds = int(self.start // 1000)
+        seconds = int(self.start // 1000)
         hours = seconds // 3600
         minutes = (seconds % 3600) // 60
         seconds = seconds % 60
-
         return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
 
 
@@ -42,49 +42,33 @@ class Transcriber:
 
     def get_transcript(self, audio_path: Path) -> List[Utterance]:
         """Get transcript, using cache if available"""
-        cached = self._get_cached(audio_path)
-        if cached:
-            print("Using cached AssemblyAI transcript...")
-            return cached
-
-        print("Getting new transcript from AssemblyAI...")
-        return self._get_fresh(audio_path)
-
-    def _get_cached(self, audio_path: Path) -> Optional[List[Utterance]]:
-        """Try to get transcript from cache"""
         cache_file = self.cache_dir / f"{audio_path.stem}.json"
-        …
-        return [Utterance(**u) for u in data["utterances"]]
+
+        if cache_file.exists():
+            with open(cache_file) as f:
+                data = json.load(f)
+            if data["hash"] == self._get_file_hash(audio_path):
+                print("Using cached AssemblyAI transcript...")
+                return [Utterance(**u) for u in data["utterances"]]
 
-    def _get_fresh(self, audio_path: Path) -> List[Utterance]:
-        """Get new transcript from AssemblyAI"""
+        print("Getting new transcript from AssemblyAI...")
         config = aai.TranscriptionConfig(speaker_labels=True, language_code="en")
         transcript = aai.Transcriber().transcribe(str(audio_path), config=config)
+
         utterances = [
             Utterance(speaker=u.speaker, text=u.text, start=u.start, end=u.end)
             for u in transcript.utterances
         ]
-        …
-
-    def _save_cache(self, audio_path: Path, utterances: List[Utterance]) -> None:
-        """Save transcript to cache"""
-        cache_file = self.cache_dir / f"{audio_path.stem}.json"
-        data = {
+
+        # Cache the result
+        cache_data = {
             "hash": self._get_file_hash(audio_path),
             "utterances": [vars(u) for u in utterances],
         }
         with open(cache_file, "w") as f:
-            json.dump(…
+            json.dump(cache_data, f, indent=2)
+
+        return utterances
 
     def _get_file_hash(self, file_path: Path) -> str:
         """Calculate MD5 hash of a file"""
@@ -101,70 +85,67 @@ class Enhancer:
     def __init__(self, api_key: str):
         generativeai.configure(api_key=api_key)
         self.model = generativeai.GenerativeModel("gemini-exp-1206")
+        self.prompt = Path("prompts/enhance.txt").read_text()
 
-        …
-        self.prompt = prompt_path.read_text()
-
-    async def enhance_chunks(self, chunks: List[tuple[str, io.BytesIO]]) -> List[str]:
-        """Enhance multiple transcript chunks in parallel"""
+    async def enhance_chunks(self, chunks: List[Tuple[str, io.BytesIO]]) -> List[str]:
+        """Enhance multiple transcript chunks concurrently with concurrency control"""
         print(f"Enhancing {len(chunks)} chunks...")
 
-        …
+        # Create a semaphore to limit concurrent requests
+        semaphore = asyncio.Semaphore(3)  # Allow up to 3 concurrent requests
+
+        async def process_chunk(i: int, chunk: Tuple[str, io.BytesIO]) -> str:
             text, audio = chunk
-        …
+            async with semaphore:
+                audio.seek(0)
+                response = await self.model.generate_content_async(
+                    [self.prompt, text, {"mime_type": "audio/mp3", "data": audio.read()}]
+                )
+                print(f"Completed chunk {i+1}/{len(chunks)}")
+                return response.text
+
+        # Create tasks for all chunks and run them concurrently
+        tasks = [
+            process_chunk(i, chunk)
+            for i, chunk in enumerate(chunks)
+        ]
 
-    async def _enhance_chunk_with_retry(self, text: str, audio: io.BytesIO, max_retries: int = 3) -> Optional[str]:
-        """Enhance a single chunk with retries"""
-        for attempt in range(max_retries):
-            try:
-                return await self._enhance_chunk(text, audio)
-            except Exception as e:
-                if attempt == max_retries - 1:
-                    print(f"Failed after {max_retries} attempts: {e}")
-                    return None
-                print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
-                await asyncio.sleep(2 ** attempt)  # Exponential backoff
-
-        …
+        # Wait for all tasks to complete
+        results = await asyncio.gather(*tasks)
+        return results
 
 
+def format_chunk(utterances: List[Utterance]) -> str:
+    """Format utterances into readable text with timestamps"""
+    sections = []
+    current_speaker = None
+    current_texts = []
+
+    for u in utterances:
+        if current_speaker != u.speaker:
+            if current_texts:
+                sections.append(f"Speaker {current_speaker} {utterances[len(sections)].timestamp}\n\n{''.join(current_texts)}")
+            current_speaker = u.speaker
+            current_texts = []
+        current_texts.append(u.text)
+
+    if current_texts:
+        sections.append(f"Speaker {current_speaker} {utterances[len(sections)].timestamp}\n\n{''.join(current_texts)}")
+
+    return "\n\n".join(sections)
 
 
-def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[tuple[str, io.BytesIO]]:
+def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[Tuple[str, io.BytesIO]]:
     """Prepare audio chunks and their corresponding text"""
-    def chunk_utterances(utterances: List[Utterance]) -> List[List[Utterance]]:
+    def chunk_utterances(utterances: List[Utterance], max_tokens: int = 8000) -> List[List[Utterance]]:
        chunks = []
        current = []
        text_length = 0
 
        for u in utterances:
-            # Check if adding this utterance would exceed token limit
            new_length = text_length + len(u.text)
-            if …
-                chunks.append(current)
+            if current and new_length > max_tokens:
+                chunks.append(current)
                current = [u]
                text_length = len(u.text)
            else:
@@ -173,99 +154,61 @@ def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[
 
        if current:
            chunks.append(current)
-
        return chunks
 
    # Split utterances into chunks
    chunks = chunk_utterances(utterances)
+    print(f"Preparing {len(chunks)} audio segments...")
 
-    # Load audio
+    # Load audio once
    audio = AudioSegment.from_file(audio_path)
 
-    print(f"Preparing {len(chunks)} audio segments...")
+    # Process each chunk
    prepared = []
    for chunk in chunks:
-        # Extract …
-        start_ms = chunk[0].start
-        end_ms = chunk[-1].end
-        segment = audio[start_ms:end_ms]
-
-        # Export to buffer
+        # Extract just the needed segment
+        segment = audio[chunk[0].start:chunk[-1].end]
        buffer = io.BytesIO()
-        …
-        text = format_transcript(chunk)
-
-        prepared.append((text, buffer))
+        # Use lower quality MP3 for faster processing
+        segment.export(buffer, format="mp3", parameters=["-q:a", "9"])
+        prepared.append((format_chunk(chunk), buffer))
 
    return prepared
 
 
-def format_transcript(utterances: List[Utterance]) -> str:
-    """Format utterances into readable text"""
-    sections = []
-    current_speaker = None
-    current_texts = []
-
-    for u in utterances:
-        # When speaker changes, output the accumulated text
-        if current_speaker != u.speaker:
-            if current_texts:  # Don't output empty sections
-                sections.append(f"Speaker {current_speaker} {utterances[len(sections)].timestamp}\n\n{''.join(current_texts)}")
-            current_speaker = u.speaker
-            current_texts = []
-        current_texts.append(u.text)
-
-    # Don't forget the last section
-    if current_texts:
-        sections.append(f"Speaker {current_speaker} {utterances[len(sections)].timestamp}\n\n{''.join(current_texts)}")
-
-    return "\n\n".join(sections)
 
 
 def main():
-    …
-    args = parser.parse_args()
-
-    audio_path = Path(args.audio_file)
-    if not audio_path.exists():
-        raise FileNotFoundError(f"File not found: {audio_path}")
-    return audio_path
+    parser = argparse.ArgumentParser()
+    parser.add_argument("audio_file", help="Audio file to transcribe")
+    args = parser.parse_args()
+
+    audio_path = Path(args.audio_file)
+    if not audio_path.exists():
+        raise FileNotFoundError(f"File not found: {audio_path}")
+
+    out_dir = Path("output/transcripts")
+    out_dir.mkdir(parents=True, exist_ok=True)
 
-    …
 
    try:
-        audio_path = setup_args()
-        out_dir = setup_output_dir()
-
-        # Initialize services
+        # Get transcript
        transcriber = Transcriber(os.getenv("ASSEMBLYAI_API_KEY"))
-        enhancer = Enhancer(os.getenv("GOOGLE_API_KEY"))
-
-        # Process
        utterances = transcriber.get_transcript(audio_path)
-        chunks = prepare_audio_chunks(audio_path, utterances)
 
        # Save original transcript
-        original = format_transcript(utterances)
+        original = format_chunk(utterances)
        (out_dir / "autogenerated-transcript.md").write_text(original)
 
-        # Enhance
+        # Enhance transcript
+        enhancer = Enhancer(os.getenv("GOOGLE_API_KEY"))
+        chunks = prepare_audio_chunks(audio_path, utterances)
        enhanced = asyncio.run(enhancer.enhance_chunks(chunks))
-        …
+
+        # Save enhanced transcript
+        merged = "\n\n".join(chunk.strip() for chunk in enhanced)
+        (out_dir / "transcript.md").write_text(merged)
 
        print("\nTranscripts saved to:")
-        print("- …
-        print("- …
+        print(f"- {out_dir}/autogenerated-transcript.md")
+        print(f"- {out_dir}/transcript.md")
 
    except Exception as e:
        print(f"Error: {e}")
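The substantive change in Enhancer.enhance_chunks: the retry helper is gone, and concurrency is now bounded by a semaphore while asyncio.gather drives every chunk at once. A minimal self-contained sketch of that pattern (the sleep stands in for the Gemini call; names here are illustrative, not from the commit):

import asyncio

async def demo() -> None:
    semaphore = asyncio.Semaphore(3)  # cap in-flight work at 3, as enhance_chunks does

    async def process_chunk(i: int) -> str:
        async with semaphore:          # at most 3 coroutines inside this block at once
            await asyncio.sleep(0.1)   # stand-in for the generate_content_async call
            return f"chunk {i + 1} done"

    # gather() starts every task immediately; the semaphore, not gather, bounds parallelism
    results = await asyncio.gather(*(process_chunk(i) for i in range(10)))
    print(results)

asyncio.run(demo())

One consequence of dropping _enhance_chunk_with_retry: a single failed request now propagates out of asyncio.gather and aborts the whole run instead of retrying with backoff.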
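For the output shape, a small usage sketch of the new format_chunk (the end field matches the constructor calls above; the import path is hypothetical):

from transcript import Utterance, format_chunk  # adjust to the real module path

utterances = [
    Utterance(speaker="A", text="Hello. ", start=0, end=1500),
    Utterance(speaker="A", text="Welcome. ", start=1500, end=4000),
    Utterance(speaker="B", text="Thanks. ", start=4000, end=6500),
]

print(format_chunk(utterances))
# Speaker A 00:00:00
#
# Hello. Welcome.
#
# Speaker B 00:00:01
#
# Thanks.

Each section header is stamped with utterances[len(sections)].timestamp, so speaker B is labeled 00:00:01 (the second utterance overall) rather than 00:00:04 (B's actual start); that indexing carries over unchanged from the old format_transcript.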