prompt is even better
.gitignore +2 -0
transcript.py +72 -49
.gitignore ADDED
@@ -0,0 +1,2 @@
+transcript.md
+autogenerated-transcript.md
transcript.py CHANGED
@@ -1,8 +1,12 @@
-import
+import argparse
 import assemblyai as aai
 from google import generativeai
 import os
 from pydub import AudioSegment
+import concurrent.futures
+
+# Suppress gRPC shutdown warnings
+os.environ["GRPC_PYTHON_LOG_LEVEL"] = "error"
 
 # Initialize API clients
 ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY")
@@ -78,12 +82,15 @@ Note: Below you'll find an auto-generated transcript that may help with speaker
 Please:
 1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.
 
-2. Optimize for readability over verbatim accuracy:
-   -
-   -
-   -
-   -
-   -
+2. Optimize AGGRESSIVELY for readability over verbatim accuracy:
+   - Readability is the most important thing!!
+   - Remove ALL conversational artifacts (yeah, so, I mean, etc.)
+   - Remove ALL filler words (um, uh, like, you know)
+   - Remove false starts and self-corrections completely
+   - Remove redundant phrases and hesitations
+   - Convert any indirect or rambling responses into direct statements
+   - Break up run-on sentences into clear, concise statements
+   - Maintain natural conversation flow while prioritizing clarity and directness
 
 3. Format the output consistently:
    - Keep the "Speaker X 00:00:00" format (no brackets, no other formatting)
@@ -103,7 +110,7 @@ Speaker A 00:01:15
 
 When we look at the data, we see a consistent pattern in the results.
 
-
+When we examine the second part of the analysis, it reveals a completely different finding.
 
 Enhance the following transcript, starting directly with the speaker format:
 """
@@ -114,7 +121,7 @@ Enhance the following transcript, starting directly with the speaker format:
     return response.text
 
 
-def create_chunks(utterances, target_tokens=7500):
+def create_chunks(utterances, target_tokens=2000):
    """Create chunks of utterances that fit within token limits"""
    chunks = []
    current_chunk = []
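Only the signature of create_chunks appears in this hunk; its body is unchanged and collapsed in the diff view. The visible change is the default token budget dropping from 7500 to 2000, so each Gemini call now receives a smaller slice of the conversation. A minimal sketch of how such a token-budget chunker might look, assuming a rough 4-characters-per-token estimate and utterances carrying text/start/end fields (assumptions, not the committed code):

def estimate_tokens(text):
    # Crude heuristic: roughly 4 characters per token (assumption)
    return len(text) // 4

def create_chunks_sketch(utterances, target_tokens=2000):
    """Group utterances into chunks that stay under a token budget."""
    chunks = []
    current = []
    current_tokens = 0
    for utt in utterances:
        tokens = estimate_tokens(utt["text"])
        if current and current_tokens + tokens > target_tokens:
            # Close the chunk, keeping its time span for later audio slicing
            chunks.append({"utterances": current,
                           "start": current[0]["start"],
                           "end": current[-1]["end"]})
            current, current_tokens = [], 0
        current.append(utt)
        current_tokens += tokens
    if current:
        chunks.append({"utterances": current,
                       "start": current[0]["start"],
                       "end": current[-1]["end"]})
    return chunks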
@@ -156,6 +163,14 @@ def create_chunks(utterances, target_tokens=7500):
     return chunks
 
 
+def process_chunk(chunk_data):
+    """Process a single chunk with Gemini"""
+    audio_path, chunk = chunk_data
+    chunk_text = format_transcript(chunk["utterances"])
+    audio_segment = get_audio_segment(audio_path, chunk["start"], chunk["end"])
+    return enhance_transcript(chunk_text, audio_segment)
+
+
 def process_audio(audio_path):
     """Main processing pipeline"""
     print("Stage 1: Getting raw transcript from AssemblyAI...")
@@ -163,34 +178,43 @@ def process_audio(audio_path):
 
     print("Stage 2: Processing in chunks...")
     chunks = create_chunks(transcript_data)
-    original_chunks = []
-    enhanced_chunks = []
 
-
-
-
-        original_chunks.append(chunk_text)
 
-
-
-
-        enhanced_chunk = enhance_transcript(chunk_text, audio_segment)
-        enhanced_chunks.append(enhanced_chunk)
 
-
-
-
-
-
-
-
-
-
-
 
+    # Get original transcript
+    original_chunks = [format_transcript(chunk["utterances"]) for chunk in chunks]
+    original_transcript = "\n".join(original_chunks)
+
+    # Process enhanced versions in parallel
+    print(f"Stage 3: Enhancing {len(chunks)} chunks in parallel...")
+    chunk_data = [(audio_path, chunk) for chunk in chunks]
+
+    # Use max_workers=None to allow as many threads as needed
+    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
+        # Submit all tasks and store with their original indices
+        future_to_index = {
+            executor.submit(process_chunk, data): i for i, data in enumerate(chunk_data)
+        }
+
+        # Create a list to store results in order
+        enhanced_chunks = [None] * len(chunks)
+
+        # Process results as they complete
+        for future in concurrent.futures.as_completed(future_to_index):
+            index = future_to_index[future]
+            print(f"Completed chunk {index + 1}/{len(chunks)}")
+            enhanced_chunks[index] = future.result()
+
+    enhanced_transcript = "\n".join(enhanced_chunks)
+
+    # Write transcripts to files
+    with open("autogenerated-transcript.md", "w", encoding="utf-8") as f:
+        f.write(original_transcript)
+
+    with open("transcript.md", "w", encoding="utf-8") as f:
+        f.write(enhanced_transcript)
+
+    print("\nTranscripts have been saved to:")
+    print("- autogenerated-transcript.md")
+    print("- transcript.md")
 
 
 def get_audio_segment(audio_path, start_time, end_time):
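The future_to_index bookkeeping above serves two purposes: as_completed lets the script log progress as each chunk finishes, while the stored indices put results back in transcript order. If per-chunk progress logging weren't needed, executor.map would be a simpler equivalent, since it yields results in input order. A minimal sketch under that assumption, not the committed code:

with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
    # map() preserves input order, so no index bookkeeping is required,
    # but results stream back in order, which hides per-chunk completion.
    enhanced_chunks = list(executor.map(process_chunk, chunk_data))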
@@ -201,23 +225,22 @@ def get_audio_segment(audio_path, start_time, end_time):
     return audio[start_ms:end_ms].export(format="mp3")
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate enhanced transcripts from audio files"
+    )
+    parser.add_argument("audio_file", help="Path to the audio file to transcribe")
+    args = parser.parse_args()
+
+    if not os.path.exists(args.audio_file):
+        print(f"Error: File '{args.audio_file}' not found")
+        return
+
+    try:
+        process_audio(args.audio_file)
+    except Exception as e:
+        print(f"Error processing audio: {str(e)}")
+
 
 if __name__ == "__main__":
-
+    main()
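Only the export line of get_audio_segment is visible in this last hunk; the rest of its body is collapsed. A body consistent with that line might look like the sketch below. Treating the incoming timestamps as already being in milliseconds is an assumption (pydub slices AudioSegments by millisecond, and AssemblyAI reports utterance times in milliseconds):

def get_audio_segment(audio_path, start_time, end_time):
    """Slice [start_time, end_time] from the source audio and export as MP3."""
    audio = AudioSegment.from_file(audio_path)
    # Assumption: incoming timestamps are already in milliseconds
    start_ms, end_ms = start_time, end_time
    return audio[start_ms:end_ms].export(format="mp3")

With the new main(), a run would look like `python transcript.py episode.mp3` (the audio filename is hypothetical). It writes autogenerated-transcript.md and transcript.md next to the script, which is why both files are added to .gitignore in this commit.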