dwarkesh committed
Commit 3e576d0 · Parent: 48f04c5

prompt is even better

Files changed (2):
1. .gitignore +2 -0
2. transcript.py +72 -49
.gitignore ADDED
@@ -0,0 +1,2 @@
+transcript.md
+autogenerated-transcript.md
transcript.py CHANGED
@@ -1,8 +1,12 @@
-import gradio as gr
+import argparse
 import assemblyai as aai
 from google import generativeai
 import os
 from pydub import AudioSegment
+import concurrent.futures
+
+# Suppress gRPC shutdown warnings
+os.environ["GRPC_PYTHON_LOG_LEVEL"] = "error"
 
 # Initialize API clients
 ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY")
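The hunk ends at the key lookup; the client wiring that presumably follows is untouched by this commit. For context, a minimal sketch of how these two SDKs are typically initialized (the GOOGLE_API_KEY variable name is an assumption, not shown in the diff):

```python
# Sketch, not part of the commit: typical initialization for these two SDKs.
import os
import assemblyai as aai
from google import generativeai

aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")       # AssemblyAI uses a module-level key
generativeai.configure(api_key=os.getenv("GOOGLE_API_KEY"))  # env var name is an assumption
```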
@@ -78,12 +82,15 @@ Note: Below you'll find an auto-generated transcript that may help with speaker
 Please:
 1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.
 
-2. Optimize for readability over verbatim accuracy:
-- Remove filler words (um, uh, like, you know)
-- Eliminate false starts and repetitions
-- Convert rambling sentences into clear, concise statements
-- Break up run-on sentences into shorter ones
-- Maintain natural conversation flow while improving clarity
+2. Optimize AGGRESSIVELY for readability over verbatim accuracy:
+- Readability is the most important thing!!
+- Remove ALL conversational artifacts (yeah, so, I mean, etc.)
+- Remove ALL filler words (um, uh, like, you know)
+- Remove false starts and self-corrections completely
+- Remove redundant phrases and hesitations
+- Convert any indirect or rambling responses into direct statements
+- Break up run-on sentences into clear, concise statements
+- Maintain natural conversation flow while prioritizing clarity and directness
 
 3. Format the output consistently:
 - Keep the "Speaker X 00:00:00" format (no brackets, no other formatting)
@@ -103,7 +110,7 @@ Speaker A 00:01:15
 
 When we look at the data, we see a consistent pattern in the results.
 
-And when we examine the second part of the analysis, it reveals a completely different finding.
+When we examine the second part of the analysis, it reveals a completely different finding.
 
 Enhance the following transcript, starting directly with the speaker format:
 """
@@ -114,7 +121,7 @@ Enhance the following transcript, starting directly with the speaker format:
     return response.text
 
 
-def create_chunks(utterances, target_tokens=7500):
+def create_chunks(utterances, target_tokens=2000):
     """Create chunks of utterances that fit within token limits"""
     chunks = []
     current_chunk = []
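Only the default changes here (7500 to 2000 target tokens), presumably so each chunk stays small enough to enhance quickly in parallel. The body of `create_chunks` is outside the hunk; a rough sketch of the usual greedy pattern, assuming a ~4-characters-per-token estimate and AssemblyAI utterances carrying `.text`, `.start`, and `.end` in milliseconds (the real body may differ):

```python
# Sketch of greedy token-budget chunking; names and heuristic are assumptions.
def create_chunks_sketch(utterances, target_tokens=2000):
    chunks, current, tokens = [], [], 0
    for u in utterances:
        estimate = len(u.text) // 4 + 1  # crude ~4 chars/token heuristic
        if current and tokens + estimate > target_tokens:
            chunks.append({"utterances": current,
                           "start": current[0].start, "end": current[-1].end})
            current, tokens = [], 0
        current.append(u)
        tokens += estimate
    if current:
        chunks.append({"utterances": current,
                       "start": current[0].start, "end": current[-1].end})
    return chunks
```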
@@ -156,6 +163,14 @@ def create_chunks(utterances, target_tokens=7500):
     return chunks
 
 
+def process_chunk(chunk_data):
+    """Process a single chunk with Gemini"""
+    audio_path, chunk = chunk_data
+    chunk_text = format_transcript(chunk["utterances"])
+    audio_segment = get_audio_segment(audio_path, chunk["start"], chunk["end"])
+    return enhance_transcript(chunk_text, audio_segment)
+
+
 def process_audio(audio_path):
     """Main processing pipeline"""
     print("Stage 1: Getting raw transcript from AssemblyAI...")
@@ -163,34 +178,43 @@ def process_audio(audio_path):
 
     print("Stage 2: Processing in chunks...")
     chunks = create_chunks(transcript_data)
-    original_chunks = []
-    enhanced_chunks = []
 
-    for i, chunk in enumerate(chunks):
-        # Get original chunk
-        chunk_text = format_transcript(chunk["utterances"])
-        original_chunks.append(chunk_text)
+    # Get original transcript
+    original_chunks = [format_transcript(chunk["utterances"]) for chunk in chunks]
+    original_transcript = "\n".join(original_chunks)
 
-        # Process enhanced version
-        print(f"Processing chunk {i+1} of {len(chunks)}...")
-        audio_segment = get_audio_segment(audio_path, chunk["start"], chunk["end"])
-        enhanced_chunk = enhance_transcript(chunk_text, audio_segment)
-        enhanced_chunks.append(enhanced_chunk)
+    # Process enhanced versions in parallel
+    print(f"Stage 3: Enhancing {len(chunks)} chunks in parallel...")
+    chunk_data = [(audio_path, chunk) for chunk in chunks]
 
-    return "\n".join(original_chunks), "\n".join(enhanced_chunks)
+    # Use max_workers=None to allow as many threads as needed
+    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
+        # Submit all tasks and store with their original indices
+        future_to_index = {
+            executor.submit(process_chunk, data): i for i, data in enumerate(chunk_data)
+        }
 
+        # Create a list to store results in order
+        enhanced_chunks = [None] * len(chunks)
 
-def handle_upload(audio):
-    """Handle Gradio interface uploads"""
-    if audio is None:
-        return "Please upload an audio file.", "Please upload an audio file."
+        # Process results as they complete
+        for future in concurrent.futures.as_completed(future_to_index):
+            index = future_to_index[future]
+            print(f"Completed chunk {index + 1}/{len(chunks)}")
+            enhanced_chunks[index] = future.result()
 
-    try:
-        original, enhanced = process_audio(audio)
-        return original, enhanced
-    except Exception as e:
-        error_msg = f"Error processing audio: {str(e)}"
-        return error_msg, error_msg
+    enhanced_transcript = "\n".join(enhanced_chunks)
+
+    # Write transcripts to files
+    with open("autogenerated-transcript.md", "w", encoding="utf-8") as f:
+        f.write(original_transcript)
+
+    with open("transcript.md", "w", encoding="utf-8") as f:
+        f.write(enhanced_transcript)
+
+    print("\nTranscripts have been saved to:")
+    print("- autogenerated-transcript.md")
+    print("- transcript.md")
 
 
 def get_audio_segment(audio_path, start_time, end_time):
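Because `process_chunk` takes a single `(audio_path, chunk)` tuple, each unit of work is one argument, which is exactly what the executor wants. Storing results back by index keeps chunks in transcript order even though completion order varies; `executor.map` would give the same ordering more tersely, at the cost of the per-chunk progress lines. A minimal equivalent, assuming the same `process_chunk` and `chunk_data` as above:

```python
# Sketch: order-preserving alternative without progress reporting.
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor() as executor:
    enhanced_chunks = list(executor.map(process_chunk, chunk_data))
```

One caveat: despite the comment in the diff, `max_workers=None` does not mean unlimited threads; since Python 3.8 it defaults to `min(32, os.cpu_count() + 4)`.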
@@ -201,23 +225,22 @@ def get_audio_segment(audio_path, start_time, end_time):
     return audio[start_ms:end_ms].export(format="mp3")
 
 
-# Create Gradio interface
-iface = gr.Interface(
-    fn=handle_upload,
-    inputs=gr.Audio(type="filepath"),
-    outputs=[
-        gr.Textbox(label="Original Transcript", container=False),
-        gr.Textbox(label="Enhanced Transcript", container=False),
-    ],
-    title="Audio Transcript Enhancement",
-    description="Upload an MP3 file to get both the original and enhanced transcripts using AssemblyAI and Gemini.",
-    cache_examples=False,
-    allow_flagging="never",
-    theme=gr.themes.Default(
-        spacing_size="sm",
-        text_size="sm",
-    ),
-)
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate enhanced transcripts from audio files"
+    )
+    parser.add_argument("audio_file", help="Path to the audio file to transcribe")
+    args = parser.parse_args()
+
+    if not os.path.exists(args.audio_file):
+        print(f"Error: File '{args.audio_file}' not found")
+        return
+
+    try:
+        process_audio(args.audio_file)
+    except Exception as e:
+        print(f"Error processing audio: {str(e)}")
+
 
 if __name__ == "__main__":
-    iface.launch()
+    main()
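The body of `get_audio_segment` is elided between hunks (only its `def` and `return` lines appear as context); it presumably slices the source file with pydub. A sketch, assuming AssemblyAI's millisecond timestamps:

```python
# Sketch of the elided helper; export() returns a file-like object the caller can read.
from pydub import AudioSegment

def get_audio_segment_sketch(audio_path, start_time, end_time):
    audio = AudioSegment.from_file(audio_path)
    start_ms, end_ms = int(start_time), int(end_time)   # timestamps assumed to be in ms
    return audio[start_ms:end_ms].export(format="mp3")
```

With the Gradio interface removed, the script now runs from the command line, e.g. `python transcript.py episode.mp3` (file name hypothetical), writing `autogenerated-transcript.md` and `transcript.md`, the two files this commit adds to `.gitignore`.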
 