dwarkesh committed
Commit d23d879 · 1 Parent(s): 0fd2cd9

transcript improvements

Files changed (1):
  1. scripts/transcript.py +114 -43
scripts/transcript.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 import json
 import hashlib
 import os
-from typing import List, Tuple
+from typing import List, Tuple, Iterator
 import assemblyai as aai
 from google import generativeai
 from pydub import AudioSegment
@@ -12,6 +12,7 @@ import asyncio
 import io
 from multiprocessing import Pool
 from functools import partial
+from itertools import groupby


 @dataclass
@@ -49,21 +50,43 @@ class Transcriber:
                 data = json.load(f)
                 if data["hash"] == self._get_file_hash(audio_path):
                     print("Using cached AssemblyAI transcript...")
-                    return [Utterance(**u) for u in data["utterances"]]
+                    # Create proper Utterance objects from cached data
+                    return [
+                        Utterance(
+                            speaker=u["speaker"],
+                            text=u["text"],
+                            start=u["start"],
+                            end=u["end"]
+                        )
+                        for u in data["utterances"]
+                    ]

         print("Getting new transcript from AssemblyAI...")
         config = aai.TranscriptionConfig(speaker_labels=True, language_code="en")
         transcript = aai.Transcriber().transcribe(str(audio_path), config=config)

         utterances = [
-            Utterance(speaker=u.speaker, text=u.text, start=u.start, end=u.end)
+            Utterance(
+                speaker=u.speaker,
+                text=u.text,
+                start=u.start,
+                end=u.end
+            )
             for u in transcript.utterances
         ]

-        # Cache the result
+        # Cache the raw utterance data
         cache_data = {
             "hash": self._get_file_hash(audio_path),
-            "utterances": [vars(u) for u in utterances],
+            "utterances": [
+                {
+                    "speaker": u.speaker,
+                    "text": u.text,
+                    "start": u.start,
+                    "end": u.end
+                }
+                for u in utterances
+            ]
         }
         with open(cache_file, "w") as f:
             json.dump(cache_data, f, indent=2)
@@ -115,49 +138,96 @@ class Enhancer:
         return results


-def format_chunk(utterances: List[Utterance]) -> str:
-    """Format utterances into readable text with timestamps"""
-    sections = []
-    current_speaker = None
-    current_texts = []
-
-    for u in utterances:
-        if current_speaker != u.speaker:
-            if current_texts:
-                sections.append(f"Speaker {current_speaker} {utterances[len(sections)].timestamp}\n\n{''.join(current_texts)}")
-            current_speaker = u.speaker
-            current_texts = []
-        current_texts.append(u.text)
-
-    if current_texts:
-        sections.append(f"Speaker {current_speaker} {utterances[len(sections)].timestamp}\n\n{''.join(current_texts)}")
-
-    return "\n\n".join(sections)
+@dataclass
+class SpeakerDialogue:
+    """Represents a continuous section of speech from a single speaker"""
+    speaker: str
+    utterances: List[Utterance]
+
+    @property
+    def start(self) -> int:
+        """Start time of first utterance"""
+        return self.utterances[0].start
+
+    @property
+    def end(self) -> int:
+        """End time of last utterance"""
+        return self.utterances[-1].end
+
+    @property
+    def timestamp(self) -> str:
+        """Format start time as HH:MM:SS"""
+        return self.utterances[0].timestamp
+
+    def format(self) -> str:
+        """Format this dialogue as text with newlines between utterances"""
+        texts = [u.text + "\n\n" for u in self.utterances]  # Add two newlines after each utterance
+        combined_text = ''.join(texts).rstrip()  # Remove trailing whitespace at the end
+        return f"Speaker {self.speaker} {self.timestamp}\n\n{combined_text}"
+
+
+def group_utterances_by_speaker(utterances: List[Utterance]) -> Iterator[SpeakerDialogue]:
+    """Group consecutive utterances by the same speaker"""
+    for speaker, group in groupby(utterances, key=lambda u: u.speaker):
+        yield SpeakerDialogue(speaker=speaker, utterances=list(group))
+
+
+def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
+    """
+    Estimate number of tokens in text
+    Args:
+        text: The text to estimate tokens for
+        chars_per_token: Estimated characters per token (default 4)
+    """
+    return (len(text) + chars_per_token - 1) // chars_per_token
+
+
+def chunk_dialogues(
+    dialogues: Iterator[SpeakerDialogue],
+    max_tokens: int = 2000,
+    chars_per_token: int = 4
+) -> List[List[SpeakerDialogue]]:
+    """
+    Split dialogues into chunks that fit within token limit
+    Args:
+        dialogues: Iterator of SpeakerDialogues
+        max_tokens: Maximum tokens per chunk
+        chars_per_token: Estimated characters per token (default 4)
+    """
+    chunks = []
+    current_chunk = []
+    current_text = ""
+
+    for dialogue in dialogues:
+        # Format this dialogue
+        formatted = dialogue.format()
+
+        # If adding this dialogue would exceed token limit, start new chunk
+        new_text = current_text + "\n\n" + formatted if current_text else formatted
+        if current_chunk and estimate_tokens(new_text, chars_per_token) > max_tokens:
+            chunks.append(current_chunk)
+            current_chunk = [dialogue]
+            current_text = formatted
+        else:
+            current_chunk.append(dialogue)
+            current_text = new_text
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+
+def format_chunk(dialogues: List[SpeakerDialogue]) -> str:
+    """Format a chunk of dialogues into readable text"""
+    return "\n\n".join(dialogue.format() for dialogue in dialogues)


 def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[Tuple[str, io.BytesIO]]:
     """Prepare audio chunks and their corresponding text"""
-    def chunk_utterances(utterances: List[Utterance], max_tokens: int = 8000) -> List[List[Utterance]]:
-        chunks = []
-        current = []
-        text_length = 0
-
-        for u in utterances:
-            new_length = text_length + len(u.text)
-            if current and new_length > max_tokens:
-                chunks.append(current)
-                current = [u]
-                text_length = len(u.text)
-            else:
-                current.append(u)
-                text_length = new_length
-
-        if current:
-            chunks.append(current)
-        return chunks
-
-    # Split utterances into chunks
-    chunks = chunk_utterances(utterances)
+    # Group utterances by speaker and split into chunks
+    dialogues = group_utterances_by_speaker(utterances)
+    chunks = chunk_dialogues(dialogues)
     print(f"Preparing {len(chunks)} audio segments...")

     # Load audio once
@@ -172,7 +242,7 @@ def prepare_audio_chunks(audio_path: Path, utterances: List[Utterance]) -> List[
         # Use lower quality MP3 for faster processing
         segment.export(buffer, format="mp3", parameters=["-q:a", "9"])
         prepared.append((format_chunk(chunk), buffer))
-
+
     return prepared


@@ -194,7 +264,8 @@ def main():
    utterances = transcriber.get_transcript(audio_path)

    # Save original transcript
-    original = format_chunk(utterances)
+    dialogues = list(group_utterances_by_speaker(utterances))  # Convert iterator to list
+    original = format_chunk(dialogues)
    (out_dir / "autogenerated-transcript.md").write_text(original)

    # Enhance transcript
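
Below is a usage sketch (not part of the commit) showing how the helpers added in this diff compose. The scripts.transcript import path is an assumption about how the module resolves, and the sample utterances are made up; the Utterance constructor fields and timestamp property, and the group/chunk/format functions, come from the diff itself, with start/end assumed to be the millisecond offsets AssemblyAI reports.

# Usage sketch, assuming the new helpers from this commit are importable.
from scripts.transcript import (  # hypothetical import path
    Utterance,
    group_utterances_by_speaker,
    chunk_dialogues,
    format_chunk,
)

# Fabricated sample data; start/end assumed to be milliseconds.
utterances = [
    Utterance(speaker="A", text="Welcome to the show.", start=0, end=2000),
    Utterance(speaker="A", text="Today we cover the pipeline.", start=2000, end=5000),
    Utterance(speaker="B", text="Happy to be here.", start=5000, end=7000),
]

# Consecutive utterances by the same speaker collapse into one SpeakerDialogue,
# so speaker A's two lines share a single "Speaker A <timestamp>" header.
dialogues = group_utterances_by_speaker(utterances)

# Chunking uses the ~4-characters-per-token estimate and the 2000-token default,
# so this tiny example stays in one chunk.
chunks = chunk_dialogues(dialogues)
assert len(chunks) == 1

print(format_chunk(chunks[0]))
# Expected shape of the output, assuming Utterance.timestamp renders
# millisecond offsets as HH:MM:SS:
#
# Speaker A 00:00:00
#
# Welcome to the show.
#
# Today we cover the pipeline.
#
# Speaker B 00:00:05
#
# Happy to be here.

This is the same formatting main() now writes to autogenerated-transcript.md, and the cache written by get_transcript stores each utterance as an explicit {"speaker", "text", "start", "end"} dict so it round-trips cleanly back into Utterance objects.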