kinda works
Browse files- transcript.py +24 -51
transcript.py
CHANGED
@@ -10,7 +10,7 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
|
10 |
|
11 |
dg_client = DeepgramClient(DEEPGRAM_API_KEY)
|
12 |
generativeai.configure(api_key=GOOGLE_API_KEY)
|
13 |
-
model = generativeai.GenerativeModel("gemini-
|
14 |
|
15 |
|
16 |
def format_timestamp(seconds):
|
@@ -74,66 +74,39 @@ def format_transcript(utterances):
|
|
74 |
|
75 |
def enhance_transcript(chunk_text, audio_segment):
|
76 |
"""Enhance transcript using Gemini AI with both text and audio"""
|
77 |
-
prompt = """
|
78 |
|
79 |
-
|
80 |
-
1. Correct transcription errors using the audio
|
81 |
-
2. Format for readability:
|
82 |
-
- Remove filler words (e.g., "um", "like", "you know")
|
83 |
-
- Remove repetitions and false starts
|
84 |
-
- Break into clear paragraphs
|
85 |
-
- Add punctuation and quotation marks
|
86 |
-
3. Maintain exact speaker names and timestamps
|
87 |
-
4. Fix speaker attribution errors by:
|
88 |
-
- Using the audio to verify who is actually speaking
|
89 |
-
- Moving text to the correct speaker's section if misattributed
|
90 |
-
- Never combining multiple speakers' text into one section
|
91 |
-
- These often happen at the end of a speaker's section or the beginning of the next speaker's section. Be aware of this!
|
92 |
|
93 |
-
|
|
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
|
107 |
-
|
|
|
|
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
Let's go to World War I and World War II. A couple months ago, I interviewed the biographer of Churchill, Andrew Roberts. As you discuss in your book, he discusses that Churchill was this sort of technological visionary and how that's a side of him that isn't talked about often. Maybe talk a little bit about what Churchill did and how he saw the power of oil.
|
113 |
-
|
114 |
-
Daniel Yergin 00:14:04
|
115 |
-
|
116 |
-
Churchill was the First Lord of the Admiralty. All the naval ships at that time ran on coal, which means you had to have people on board shoveling coal. It took a long time to get the coal on board. If you switched to oil, the ships would be faster. They wouldn't need to take the same time. They wouldn't need to carry the same people.
|
117 |
-
|
118 |
-
So he made the decision—obviously others like Admiral Jackie Fisher were pushing him—to convert the Royal Navy to oil. People were saying this is treacherous because we'll depend upon oil from far away, from Persia, rather than Welsh coal. He said, "This is the prize of the venture." That's where I got my title from. Originally it was going to be called "The Prize of the Venture" because that's what he said. Then I just made it The Prize.
|
119 |
-
|
120 |
-
During World War I, he promoted another military development. I'm forgetting what it was called initially, but it eventually became known as the tank. He really did constantly push technology. Why? I don't know. He was not educated like that. He was educated in the classic sense. That's why he wrote so well. But he understood technology and that you had to constantly push for advantage.
|
121 |
-
|
122 |
-
</Enhanced>
|
123 |
-
|
124 |
-
Notice how the enhanced version:
|
125 |
-
1. Maintains exact speaker names and timestamps
|
126 |
-
2. Removes filler words and repetitions
|
127 |
-
3. Breaks long passages into logical paragraphs
|
128 |
-
4. Adds proper punctuation and quotation marks
|
129 |
-
6. Corrects speaker attribution errors.
|
130 |
-
|
131 |
-
Output only the enhanced transcript, maintaining speaker names and timestamps exactly as given.
|
132 |
|
|
|
133 |
"""
|
134 |
|
135 |
response = model.generate_content(
|
136 |
-
[prompt,
|
137 |
)
|
138 |
return response.text
|
139 |
|
|
|
10 |
|
11 |
dg_client = DeepgramClient(DEEPGRAM_API_KEY)
|
12 |
generativeai.configure(api_key=GOOGLE_API_KEY)
|
13 |
+
model = generativeai.GenerativeModel("gemini-exp-1206")
|
14 |
|
15 |
|
16 |
def format_timestamp(seconds):
|
|
|
74 |
|
75 |
def enhance_transcript(chunk_text, audio_segment):
    """Enhance a transcript chunk using Gemini AI with both text and audio.

    Args:
        chunk_text: Transcript text for this chunk. It is sent to the model
            right after the instructions so that "the following transcript"
            in the prompt actually refers to something.
        audio_segment: Readable binary file-like object. Its entire contents
            are read and attached as inline ``audio/mp3`` data (assumed to be
            the MP3 audio matching ``chunk_text`` -- confirm against caller).

    Returns:
        ``response.text`` from the model, which the prompt instructs to be
        only the enhanced transcript.
    """
    prompt = """You are an expert transcript editor. Your task is to enhance this transcript for maximum readability while maintaining the core message.

IMPORTANT: Respond ONLY with the enhanced transcript. Do not include any explanations, headers, or phrases like "Here is the transcript."

Please:
1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.

2. Optimize for readability over verbatim accuracy:
- Remove filler words (um, uh, like, you know)
- Eliminate false starts and repetitions
- Convert rambling sentences into clear, concise statements
- Break up run-on sentences into shorter ones
- Maintain natural conversation flow while improving clarity

3. Format the output consistently:
- Keep the "Speaker X [timestamp]" format
- Use proper punctuation and capitalization
- Add paragraph breaks for topic changes
- Preserve distinct speaker turns

Example input:
Speaker 1 00:01:15
Um, yeah, so like, what I was thinking was, you know, when we look at the data, the data shows us that, uh, there's this pattern, this pattern that keeps coming up again and again in the results.

Example output:
Speaker 1 00:01:15
When we look at the data, we see a consistent pattern in the results.

Enhance the following transcript, starting directly with the speaker format:
"""

    # Order matters: instructions first, then the transcript text they refer
    # to, then the audio the model uses to verify wording and speakers.
    # Previously chunk_text was omitted, so the model only ever saw the
    # instructions and the audio.
    request_parts = [
        prompt,
        chunk_text,
        {"mime_type": "audio/mp3", "data": audio_segment.read()},
    ]
    response = model.generate_content(request_parts)
    return response.text
|
112 |
|