dwarkesh committed on
Commit
dac6bda
·
1 Parent(s): 8db26e0

prompt doesn't work

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. requirements.txt +7 -0
  3. transcript.py +240 -0
app.py CHANGED
@@ -52,7 +52,7 @@ current_prompts = DEFAULT_PROMPTS.copy()
52
  def load_examples(filename: str, columns: list) -> str:
53
  """Load examples from CSV file."""
54
  try:
55
- df = pd.read_csv(filename)
56
  if len(columns) == 1:
57
  examples = df[columns[0]].dropna().tolist()
58
  return "\n\n".join(examples)
 
52
  def load_examples(filename: str, columns: list) -> str:
53
  """Load examples from CSV file."""
54
  try:
55
+ df = pd.read_csv(f"source/{filename}")
56
  if len(columns) == 1:
57
  examples = df[columns[0]].dropna().tolist()
58
  return "\n\n".join(examples)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ deepgram-sdk
3
+ google-generativeai
4
+ anthropic
5
+ pandas
6
+ youtube-transcript-api
7
+ pydub
transcript.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from deepgram import DeepgramClient, PrerecordedOptions
3
+ from google import generativeai
4
+ import os
5
+ from pydub import AudioSegment
6
+
7
# Credentials come from the environment; a missing variable yields None here.
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Module-level clients shared by the functions below.
dg_client = DeepgramClient(DEEPGRAM_API_KEY)
generativeai.configure(api_key=GOOGLE_API_KEY)
model = generativeai.GenerativeModel("gemini-2.0-flash-exp")
14
+
15
+
16
def format_timestamp(seconds):
    """Convert an offset in seconds to a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Offset in seconds; any value ``float()`` accepts (int,
            float, or numeric string). The fractional part is truncated.

    Returns:
        The offset formatted as ``HH:MM:SS``. Hours are not wrapped at 24,
        so 86400 seconds renders as ``"24:00:00"``.
    """
    # Parse the input once instead of re-converting it for every field.
    total_seconds = int(float(seconds))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
22
+
23
+
24
def get_transcript(audio_path):
    """Send an audio file to Deepgram and return the resulting utterances.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        ``response.results.utterances`` from the Deepgram response.
    """
    # Diarization and utterance segmentation are both requested so that each
    # returned utterance carries a speaker and start/end times.
    transcription_options = PrerecordedOptions(
        model="nova-2",
        language="en-US",
        smart_format=True,
        diarize=True,
        utterances=True,
    )

    with open(audio_path, "rb") as audio_file:
        # The upload is always labelled as MP3, regardless of the file's type.
        payload = {"buffer": audio_file, "mimetype": "audio/mp3"}
        rest_client = dg_client.listen.rest.v("1")
        response = rest_client.transcribe_file(payload, transcription_options)
        return response.results.utterances
38
+
39
+
40
def format_transcript(utterances):
    """Render utterances as speaker-labelled, timestamped text sections.

    Consecutive utterances from the same speaker are merged into a single
    section headed ``Speaker <id> <HH:MM:SS>``, where the timestamp is the
    start of the first utterance in that run. The header is followed by a
    newline and the run's text joined with single spaces.

    Args:
        utterances: Iterable of objects exposing ``speaker``, ``start`` and
            ``transcript``.

    Returns:
        All sections joined by a blank line.
    """

    def render(speaker_id, started_at, parts):
        # One section: header line, then the merged text of the run.
        stamp = format_timestamp(started_at)
        return f"Speaker {speaker_id} {stamp}\n{' '.join(parts).strip()}"

    sections = []
    speaker = None
    run_start = None
    run_text = []

    for utterance in utterances:
        if speaker != utterance.speaker:
            # Speaker changed: emit the finished run (if any) and start anew.
            if run_text:
                sections.append(render(speaker, run_start, run_text))
                run_text = []
            speaker = utterance.speaker
            run_start = utterance.start

        run_text.append(utterance.transcript.strip())

    # The last run is still pending once the loop ends.
    if run_text:
        sections.append(render(speaker, run_start, run_text))

    return "\n\n".join(sections)
73
+
74
+
75
def enhance_transcript(chunk_text, audio_segment):
    """Ask Gemini to clean up one transcript chunk using its audio.

    Args:
        chunk_text: Formatted transcript text for the chunk.
        audio_segment: Readable binary file-like object holding the chunk's
            audio; its full contents are sent labelled as ``audio/mp3``.

    Returns:
        The ``text`` of the model response.
    """
    # NOTE(review): the worked example below changes the speaker label
    # ("Dwarkesh" -> "Dwarkesh Patel") and the timestamp format
    # ("0:13:37" -> "00:13:37") while the instructions ask for both to be kept
    # exactly. Confirm which behaviour is intended before relying on it.
    prompt = """As a professional transcript editor, enhance this transcript for maximum readability while preserving accuracy.

Key Instructions:
1. Correct transcription errors using the audio
2. Format for readability:
- Remove filler words (e.g., "um", "like", "you know")
- Remove repetitions and false starts
- Break into clear paragraphs
- Add punctuation and quotation marks
3. Maintain exact speaker names and timestamps
4. Fix speaker attribution errors by:
- Using the audio to verify who is actually speaking
- Moving text to the correct speaker's section if misattributed
- Never combining multiple speakers' text into one section
- These often happen at the end of a speaker's section or the beginning of the next speaker's section. Be aware of this!

Example:

<Original>
Dwarkesh 0:13:37
Let's let's go to World War 1 and World War 2. So I would, you know, I, I had on the, um, the
A couple of months ago, I interviewed the biographer of Churchill, Andrew Roberts, and we, as you discussed in your book, and he discusses, you know, Churchill was the sort of technological visionary, and that's the part of him that isn't talked about often. Um,
Of you maybe talk a little bit about what Churchill did and how he saw the power of oil. I think Churchill was

Daniel Yergin 0:14:04
the first Lord of the Admiralty, and he saw that if you can convert all the naval ships at that time ran on coal, which means you had to have people on board shoveling coal, and it took a long time to get the coal on board, and if you switch to oil, you would have faster, uh, the ships would be faster, they wouldn't need to take the same time. They wouldn't need to carry the same people. And so he made
The decision, obviously others like Admiral Jackie Fisher were pushing him to convert the Royal Navy to to oil and people saying this is treacherous because we'll depend upon oil from far away, from Persia, uh, rather than Welsh coal and uh he said, um, you know, he said, um this is the prize of the venture. That's where I got my title from originally it was going to be called The Prize of the Venture, because that's what he said and then I just made it the prize, but uh, he saw that.
During, uh, uh, World War 2, World War 1, he promoted another uh uh military development, um, I'm forgetting what it was called initially, but it eventually became known as the tank. I mean, so he really did kind of constantly push technology.
Why I don't know. I mean, he was actually, you know, was not, he was not educated, uh, as that he was educated and, you know, in the sort of classic I wrote so well, uh, but, uh, he understood technology and that you had a kind of constantly push for advantage.

</Original>

<Enhanced>
Dwarkesh Patel 00:13:37

Let's go to World War I and World War II. A couple months ago, I interviewed the biographer of Churchill, Andrew Roberts. As you discuss in your book, he discusses that Churchill was this sort of technological visionary and how that's a side of him that isn't talked about often. Maybe talk a little bit about what Churchill did and how he saw the power of oil.

Daniel Yergin 00:14:04

Churchill was the First Lord of the Admiralty. All the naval ships at that time ran on coal, which means you had to have people on board shoveling coal. It took a long time to get the coal on board. If you switched to oil, the ships would be faster. They wouldn't need to take the same time. They wouldn't need to carry the same people.

So he made the decision—obviously others like Admiral Jackie Fisher were pushing him—to convert the Royal Navy to oil. People were saying this is treacherous because we'll depend upon oil from far away, from Persia, rather than Welsh coal. He said, "This is the prize of the venture." That's where I got my title from. Originally it was going to be called "The Prize of the Venture" because that's what he said. Then I just made it The Prize.

During World War I, he promoted another military development. I'm forgetting what it was called initially, but it eventually became known as the tank. He really did constantly push technology. Why? I don't know. He was not educated like that. He was educated in the classic sense. That's why he wrote so well. But he understood technology and that you had to constantly push for advantage.

</Enhanced>

Notice how the enhanced version:
1. Maintains exact speaker names and timestamps
2. Removes filler words and repetitions
3. Breaks long passages into logical paragraphs
4. Adds proper punctuation and quotation marks
5. Corrects speaker attribution errors.

Output only the enhanced transcript, maintaining speaker names and timestamps exactly as given.

"""

    # One multimodal request: instructions, the chunk's text, then its audio.
    audio_part = {"mime_type": "audio/mp3", "data": audio_segment.read()}
    response = model.generate_content([prompt, chunk_text, audio_part])
    return response.text
139
+
140
+
141
def create_chunks(utterances, target_tokens=7500, tokens_per_second=25):
    """Group consecutive utterances into chunks that fit a token budget.

    A chunk's cost is estimated from its audio span: the seconds between the
    start of the chunk's first utterance and the end of the candidate
    utterance, multiplied by ``tokens_per_second``. When adding a candidate
    would push the estimate above ``target_tokens``, the current chunk is
    closed and the candidate starts the next one. A single utterance that
    exceeds the budget on its own still becomes a (one-utterance) chunk.

    Args:
        utterances: Iterable of objects exposing ``start`` and ``end`` in
            seconds (anything ``float()`` accepts).
        target_tokens: Estimated-token budget per chunk.
        tokens_per_second: Estimated tokens per second of audio. Defaults to
            25, the value that was previously hard-coded.

    Returns:
        A list of dicts with keys ``"utterances"`` (list of utterances),
        ``"start"`` (start of the first utterance) and ``"end"`` (end of the
        last utterance). Empty input yields an empty list.
    """

    def as_chunk(items):
        # A chunk spans from its first utterance's start to its last one's end.
        return {
            "utterances": items,
            "start": items[0].start,
            "end": items[-1].end,
        }

    chunks = []
    current = []

    for utterance in utterances:
        if current:
            span_seconds = float(utterance.end) - float(current[0].start)
            if span_seconds * tokens_per_second > target_tokens:
                # Budget exceeded: close this chunk; the utterance opens the next.
                chunks.append(as_chunk(current))
                current = []
        current.append(utterance)

    if current:
        chunks.append(as_chunk(current))

    return chunks
179
+
180
+
181
def process_audio(audio_path):
    """Run the full pipeline: transcribe, chunk, then enhance each chunk.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        A ``(original, enhanced)`` pair of strings, each built by joining the
        per-chunk texts with a single newline.
    """
    print("Stage 1: Getting raw transcript from Deepgram...")
    transcript_data = get_transcript(audio_path)

    print("Stage 2: Processing in chunks...")
    chunks = create_chunks(transcript_data)
    original_chunks = []
    enhanced_chunks = []

    for index, chunk in enumerate(chunks, start=1):
        # Keep the unmodified text so both versions can be shown side by side.
        chunk_text = format_transcript(chunk["utterances"])
        original_chunks.append(chunk_text)

        print(f"Processing chunk {index} of {len(chunks)}...")
        audio_segment = get_audio_segment(audio_path, chunk["start"], chunk["end"])
        try:
            enhanced_chunks.append(enhance_transcript(chunk_text, audio_segment))
        finally:
            # get_audio_segment hands back the open file object produced by
            # AudioSegment.export; close it so handles do not pile up with
            # one leaked file per chunk.
            audio_segment.close()

    return "\n".join(original_chunks), "\n".join(enhanced_chunks)
203
+
204
+
205
def handle_upload(audio):
    """Gradio callback: process the uploaded audio and return both transcripts.

    Args:
        audio: Value supplied by the Gradio audio input, or ``None`` when
            nothing was uploaded.

    Returns:
        A pair of strings for the two output boxes: the original and enhanced
        transcripts on success, or the same message twice when no file was
        given or processing raised.
    """
    if audio is None:
        missing_notice = "Please upload an audio file."
        return missing_notice, missing_notice

    try:
        original_text, enhanced_text = process_audio(audio)
    except Exception as exc:
        # UI boundary: report the failure in both boxes instead of raising.
        failure_notice = f"Error processing audio: {str(exc)}"
        return failure_notice, failure_notice
    else:
        return original_text, enhanced_text
216
+
217
+
218
def get_audio_segment(audio_path, start_time, end_time):
    """Cut the ``[start_time, end_time]`` span out of an audio file as MP3.

    Args:
        audio_path: Path of the source audio file.
        start_time: Span start in seconds (anything ``float()`` accepts).
        end_time: Span end in seconds (anything ``float()`` accepts).

    Returns:
        Whatever ``AudioSegment.export(format="mp3")`` returns for the slice.
    """

    def to_milliseconds(value):
        # Slice bounds are given in whole milliseconds; truncate the fraction.
        return int(float(value) * 1000)

    full_audio = AudioSegment.from_file(audio_path)
    clip = full_audio[to_milliseconds(start_time):to_milliseconds(end_time)]
    return clip.export(format="mp3")
224
+
225
+
226
# Gradio UI: one audio upload (passed to the callback as a file path) in,
# two text boxes out (the raw and the enhanced transcript).
iface = gr.Interface(
    title="Audio Transcript Enhancement",
    description="Upload an MP3 file to get both the original and enhanced transcripts using Deepgram and Gemini.",
    fn=handle_upload,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Original Transcript"),
        gr.Textbox(label="Enhanced Transcript"),
    ],
    cache_examples=False,
)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()