dwarkesh committed
Commit 8db26e0 · 1 Parent(s): 8b25524

will this work

.DS_Store ADDED
Binary file (6.15 kB)
 
old/app.py → app.py RENAMED
File without changes
old/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
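Each deleted rule above routed matching files through Git LFS instead of regular Git storage. A minimal sketch of what that pattern matching amounts to, assuming simple fnmatch-style globbing (real .gitattributes matching has additional path rules); the pattern list and helper name are illustrative only:

    # Illustrative only; fnmatch globbing approximates gitattributes matching.
    from fnmatch import fnmatch

    LFS_PATTERNS = ["*.bin", "*.safetensors", "*.parquet", "saved_model/**/*"]

    def uses_lfs(path: str) -> bool:
        # True if any LFS pattern matches the given path.
        return any(fnmatch(path, pattern) for pattern in LFS_PATTERNS)

    print(uses_lfs("model.safetensors"))  # True
    print(uses_lfs("app.py"))             # False
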
old/README.md DELETED
@@ -1,12 +0,0 @@
- ---
- title: Producer
- emoji: ⚡
- colorFrom: green
- colorTo: pink
- sdk: gradio
- sdk_version: 5.8.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
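The front matter above configured a Gradio Space with app.py as its entry point. A minimal sketch of the shape of app that such a config expects, assuming nothing about the real app.py in this repo; every name below is illustrative:

    # Illustrative skeleton only; the actual app.py is unchanged by this commit.
    import gradio as gr

    def respond(text: str) -> str:
        # Placeholder handler standing in for the real logic.
        return text

    demo = gr.Interface(fn=respond, inputs="text", outputs="text")

    if __name__ == "__main__":
        demo.launch()
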
old/csv-to-yaml.py DELETED
@@ -1,105 +0,0 @@
- import pandas as pd
- import yaml
- from pathlib import Path
-
- def clean_text(text):
-     if pd.isna(text):
-         return ""
-     return str(text).strip()
-
- def convert_timestamps(df):
-     examples = []
-     for row in df['Timestamps'].dropna():
-         examples.append(clean_text(row))
-     return {
-         'name': 'Timestamps Generator',
-         'description': 'Generates timestamps for key moments in podcast episodes',
-         'examples': examples
-     }
-
- def convert_titles(df):
-     examples = []
-     for title in df['Titles'].dropna():
-         examples.append(clean_text(title))
-     return {
-         'name': 'Episode Titles',
-         'description': 'Collection of episode titles',
-         'examples': examples
-     }
-
- def convert_descriptions(df):
-     examples = []
-     for _, row in df.iterrows():
-         if pd.notna(row['Tweet Text']):
-             examples.append({
-                 'text': clean_text(row['Tweet Text']),
-                 'link': clean_text(row.get('Link', ''))
-             })
-     return {
-         'name': 'Viral Episode Descriptions',
-         'description': 'Viral-worthy episode descriptions for Twitter',
-         'examples': examples
-     }
-
- def convert_titles_thumbnails(df):
-     examples = []
-     for _, row in df.iterrows():
-         if pd.notna(row['Titles']) and pd.notna(row['Thumbnail']):
-             examples.append({
-                 'title': clean_text(row['Titles']),
-                 'thumbnail': clean_text(row['Thumbnail'])
-             })
-     return {
-         'name': 'Titles and Thumbnails',
-         'description': 'Title and thumbnail combinations for episodes',
-         'examples': examples
-     }
-
- def convert_viral_clips(df):
-     examples = []
-     for _, row in df.iterrows():
-         if pd.notna(row['Tweet Text']) and pd.notna(row['Clip Transcript']):
-             example = {
-                 'tweet': clean_text(row['Tweet Text']),
-                 'transcript': clean_text(row['Clip Transcript'])
-             }
-             if pd.notna(row.get('Link')):
-                 example['link'] = clean_text(row['Link'])
-             if pd.notna(row.get('Likes')):
-                 example['metrics'] = {
-                     'likes': int(row['Likes']),
-                     'reposts': int(row['Reposts']),
-                     'quotes': int(row['Quotes'])
-                 }
-             examples.append(example)
-     return {
-         'name': 'Viral Clips',
-         'description': 'Collection of viral clips with engagement metrics',
-         'examples': examples
-     }
-
- def main():
-     # Create prompts directory
-     prompts_dir = Path('../prompts')
-     prompts_dir.mkdir(exist_ok=True)
-     print(f"Created prompts directory at {prompts_dir}")
-     # Convert each CSV
-     conversions = {
-         'Timestamps.csv': (pd.read_csv('source/Timestamps.csv'), convert_timestamps),
-         'Titles.csv': (pd.read_csv('source/Titles.csv'), convert_titles),
-         'Viral Episode Descriptions.csv': (pd.read_csv('source/Viral Episode Descriptions.csv'), convert_descriptions),
-         'Titles & Thumbnails.csv': (pd.read_csv('source/Titles & Thumbnails.csv'), convert_titles_thumbnails),
-         'Viral Twitter Clips.csv': (pd.read_csv('source/Viral Twitter Clips.csv'), convert_viral_clips)
-     }
-
-     for filename, (df, converter) in conversions.items():
-         output = converter(df)
-         yaml_filename = prompts_dir / f"{filename.split('.')[0].lower().replace(' ', '_')}.yaml"
-
-         with open(yaml_filename, 'w', encoding='utf-8') as f:
-             yaml.dump(output, f, allow_unicode=True, sort_keys=False, width=1000)
-
-         print(f"Converted {filename} to {yaml_filename}")
-
- if __name__ == "__main__":
-     main()
 
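The deleted script above writes one YAML file per CSV into a prompts/ directory. A minimal round-trip sketch, assuming a hypothetical output file prompts/titles.yaml produced by convert_titles:

    # Illustrative only; the file path is an assumption.
    import yaml

    with open("prompts/titles.yaml", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    print(data["name"])           # "Episode Titles"
    print(len(data["examples"]))  # number of converted titles
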
old/generate-transcript.py DELETED
@@ -1,222 +0,0 @@
- from dataclasses import dataclass
- import os
- from typing import List, Optional, Dict
- import json
- from google import genai
- from google.genai.types import Tool, GenerateContentConfig, GoogleSearch
- import asyncio
- from deepgram import Deepgram
- import mimetypes
-
- @dataclass
- class TranscriptSegment:
-     speaker: str
-     text: str
-     start: float
-     end: float
-
- @dataclass
- class ProcessedChunk:
-     segments: List[TranscriptSegment]
-     original_text: str
-     processed_text: Optional[str] = None
-
- class TranscriptProcessor:
-     def __init__(self, max_tokens: int = 6000):
-         """Initialize the TranscriptProcessor with API clients.
-
-         Environment variables required:
-         - GOOGLE_API_KEY: API key for Google Gemini
-         - DEEPGRAM_API_KEY: API key for Deepgram
-         """
-         self.max_tokens = max_tokens
-
-         # Get API keys from environment variables
-         genai_api_key = os.getenv('GOOGLE_API_KEY')
-         deepgram_api_key = os.getenv('DEEPGRAM_API_KEY')
-
-         if not genai_api_key:
-             raise ValueError("GOOGLE_API_KEY environment variable is not set")
-         if not deepgram_api_key:
-             raise ValueError("DEEPGRAM_API_KEY environment variable is not set")
-
-         self.genai_client = genai.Client(api_key=genai_api_key)
-         self.deepgram_client = Deepgram(deepgram_api_key)
-
-         # Create search tool
-         self.google_search_tool = Tool(
-             google_search=GoogleSearch()
-         )
-
-         self.prompt_template = """
- Your task is to improve the readability of this transcript while maintaining its core meaning. Additionally, you should add relevant links to technical terms, products, papers, people, or concepts mentioned in the transcript.
-
- Examples of input and desired output:
-
- Input:
- "Speaker A: Yeah, so like, um, you know, I've been really diving into this whole, like, transformer architecture thing, right? And, um, what's really interesting is like, you know, how they handle this attention mechanism stuff. I mean, it's like, basically, you know, the way it processes sequential data is just, like, mind-blowing if you think about it. And, um, yeah, what I'm trying to say is that, like, the whole self-attention concept is just, you know, really revolutionary and stuff."
-
- Output:
- "Speaker A: I've been exploring the [transformer architecture](https://arxiv.org/abs/1706.03762), and it's fascinating how it implements attention mechanisms. The way it processes sequential data is revolutionary, particularly the [self-attention concept](https://distill.pub/2016/augmented-rnns/#attentional-interfaces) that fundamentally changed the field."
-
- Input:
- "Speaker B: Yeah, yeah, totally, and like, you know what's really cool is that, um, I've been working with PyTorch for this project I'm doing, and like, you know, implementing BERT has been super helpful because, um, you know, it's like pre-trained and stuff, and I mean, the whole masked language modeling thing is just, like, really powerful, you know what I mean? And then, like, there's all these other models like GPT and RoBERTa that kind of like, you know, built on top of it and made things even better, if that makes sense."
-
- Output:
- "Speaker B: I've been working with [PyTorch](https://pytorch.org/) on my project, implementing [BERT](https://arxiv.org/abs/1810.04805) with its pre-trained capabilities. The masked language modeling approach has proven powerful, leading to advancements like [GPT](https://arxiv.org/abs/2005.14165) and [RoBERTa](https://arxiv.org/abs/1907.11692)."
-
- Input:
- "Speaker A: Right, right, and you know what's really interesting is like, um, when you look at the training data requirements and stuff, it's like, you know, these large language models need just a massive amount of, like, compute and data to train properly, and I mean, that's why, you know, we're seeing all these different approaches to trying to make it more efficient, like, um, you know, quantization and pruning and stuff like that. And like, yeah, I think that's why the whole debate about compute requirements is getting so much attention nowadays."
-
- Output:
- "Speaker A: The training requirements for large language models are substantial, demanding extensive compute resources and data. This has led to efficiency innovations like [quantization](https://arxiv.org/abs/2103.13630) and [pruning](https://arxiv.org/abs/2010.13103), sparking important discussions about computational sustainability in AI."
-
- Instructions:
- * Make text highly readable while maintaining the core message and natural speech patterns
- * Remove or consolidate:
-   - Filler words (um, uh, you know, like, I mean)
-   - False starts and self-corrections
-   - Repeated phrases and stutters
-   - Verbal tics and unnecessary interjections
- * Improve sentence structure
- * Format for clarity
- * Enhance coherence
- * Preserve speaker labels and technical terms exactly
- * Add Markdown-style links to:
-   - Technical terms and concepts
-   - Products and tools mentioned
-   - Research papers or articles referenced
-   - Notable people or organizations
-   - Only add links when you're confident about the reference
-   - Prioritize official documentation, papers, or authoritative sources
-   - For papers, prefer arXiv links when available
-
- Here's the transcript to process:
- {text}
- """
-
-     async def transcribe_audio(self, audio_path: str) -> List[TranscriptSegment]:
-         """Transcribe audio using Deepgram with automatic format detection"""
-         # Get the mime type of the audio file
-         mime_type = mimetypes.guess_type(audio_path)[0]
-         if not mime_type:
-             # Default to mp3 if we can't detect the type
-             mime_type = 'audio/mpeg' if audio_path.lower().endswith('.mp3') else 'audio/wav'
-
-         with open(audio_path, 'rb') as audio:
-             source = {'buffer': audio, 'mimetype': mime_type}
-             response = await self.deepgram_client.transcription.prerecorded(
-                 source,
-                 {
-                     'smart_format': True,
-                     'punctuate': True,
-                     'diarize': True,
-                     'utterances': True
-                 }
-             )
-
-         segments = []
-         for utterance in response['results']['utterances']:
-             segments.append(TranscriptSegment(
-                 speaker=f"Speaker {utterance['speaker']}",
-                 text=utterance['transcript'],
-                 start=utterance['start'],
-                 end=utterance['end']
-             ))
-
-         return segments
-
-     def create_chunks(self, segments: List[TranscriptSegment]) -> List[ProcessedChunk]:
-         chunks = []
-         current_segments = []
-         current_text = ""
-
-         for segment in segments:
-             segment_text = f"{segment.speaker}: {segment.text}\n"
-             potential_text = current_text + segment_text
-             potential_prompt = self.prompt_template.format(text=potential_text)
-
-             if len(potential_prompt.split()) > (self.max_tokens * 0.75) and current_segments:
-                 chunks.append(ProcessedChunk(
-                     segments=current_segments,
-                     original_text=current_text
-                 ))
-                 current_segments = []
-                 current_text = ""
-
-             current_segments.append(segment)
-             current_text += segment_text
-
-         if current_segments:
-             chunks.append(ProcessedChunk(
-                 segments=current_segments,
-                 original_text=current_text
-             ))
-
-         return chunks
-
-     async def process_chunk(self, chunk: ProcessedChunk) -> None:
-         """Process a chunk using Gemini 2.0 Flash with Search enabled"""
-         prompt = self.prompt_template.format(text=chunk.original_text)
-
-         response = await self.genai_client.aio.models.generate_content(  # .aio exposes the awaitable API surface
-             model='gemini-2.0-flash-exp',
-             contents=prompt,
-             config=GenerateContentConfig(
-                 tools=[self.google_search_tool],
-                 response_modalities=["TEXT"],
-             )
-         )
-
-         # Get the main response text
-         chunk.processed_text = ""
-         for part in response.candidates[0].content.parts:
-             chunk.processed_text += part.text
-
-         # Log the search metadata for debugging/verification
-         if hasattr(response.candidates[0], 'grounding_metadata') and \
-            hasattr(response.candidates[0].grounding_metadata, 'search_entry_point'):
-             print(f"Search metadata found for chunk: {response.candidates[0].grounding_metadata.search_entry_point.rendered_content}")
-
-     async def process_transcript(self, audio_path: str) -> str:
-         """Main processing pipeline"""
-         # Transcribe audio
-         segments = await self.transcribe_audio(audio_path)
-
-         # Create chunks
-         chunks = self.create_chunks(segments)
-
-         # Process each chunk with retries
-         async def process_with_retry(chunk, max_retries=3):
-             for attempt in range(max_retries):
-                 try:
-                     await self.process_chunk(chunk)
-                     return
-                 except Exception as e:
-                     if attempt == max_retries - 1:
-                         print(f"Failed to process chunk after {max_retries} attempts: {e}")
-                         raise
-                     await asyncio.sleep(1 * (attempt + 1))  # Linear backoff between attempts
-
-         # Process chunks in parallel with retries
-         tasks = [process_with_retry(chunk) for chunk in chunks]
-         await asyncio.gather(*tasks)
-
-         # Combine processed chunks
-         final_text = "\n".join(chunk.processed_text for chunk in chunks if chunk.processed_text)
-
-         return final_text
-
-
- async def main():
-     # Make sure to set these environment variables before running:
-     # export GOOGLE_API_KEY="your_google_api_key"
-     # export DEEPGRAM_API_KEY="your_deepgram_api_key"
-     processor = TranscriptProcessor()
-
-     # Example usage with either MP3 or WAV file
-     final_transcript = await processor.process_transcript("audio.mp3")
-
-     # Save as markdown file
-     with open("transcript.md", "w") as f:
-         f.write(final_transcript)
 
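The deleted module above defines main() but never schedules it on an event loop. A minimal driver sketch, assuming the file were importable (e.g. renamed to generate_transcript.py) and both API keys exported; the module name and audio path are assumptions:

    # Illustrative driver only; not part of the deleted file.
    import asyncio
    from generate_transcript import TranscriptProcessor

    async def run() -> None:
        processor = TranscriptProcessor(max_tokens=6000)
        transcript = await processor.process_transcript("audio.mp3")
        print(transcript[:500])  # preview the processed Markdown

    asyncio.run(run())
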
old/prompts.json DELETED
@@ -1,6 +0,0 @@
- {
-     "clips": "You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.\nFormat as:\nTweet 1\nTweet Text: [text]\nClip Transcript: [45-120 seconds of transcript]\n\nPrevious examples:\n{clips_examples}",
-     "description": "Create an engaging episode description tweet (280 chars max) that:\n1. Highlights compelling aspects\n2. Includes topic areas and handles\n3. Ends with \"Links below\" or \"Enjoy!\"\n\nPrevious examples:\n{description_examples}",
-     "timestamps": "Generate timestamps (HH:MM:SS) every 3-8 minutes covering key transitions and moments.\nUse 2-6 word descriptions.\nStart at 00:00:00.\n\nPrevious examples:\n{timestamps_examples}",
-     "titles_and_thumbnails": "Create 3-5 compelling title-thumbnail combinations that tell a story.\n\nTitle Format: \"Guest Name \u2013 Key Story or Core Insight\"\nThumbnail: 2-4 ALL CAPS words that create intrigue with the title\n\nExample: \"David Reich \u2013 How One Small Tribe Conquered the World 70,000 Years Ago\"\nThumbnail: \"LAST HUMANS STANDING\"\n\nThe combination should create intellectual curiosity without clickbait.\n\nPrevious examples:\n{titles_and_thumbnails_examples}"
- }
 
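Each deleted template above carries a {..._examples} placeholder. A minimal sketch of filling one in, assuming plain str.format substitution and a local copy of the deleted file at prompts.json (both are assumptions; the consuming code in app.py may differ):

    # Illustrative only; file path and example text are assumptions.
    import json

    with open("prompts.json", encoding="utf-8") as f:
        prompts = json.load(f)

    clips_prompt = prompts["clips"].format(
        clips_examples="Tweet 1\nTweet Text: ...\nClip Transcript: ..."
    )
    print(clips_prompt)
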
old/requirements.txt DELETED
@@ -1,4 +0,0 @@
- gradio>=4.0.0
- anthropic>=0.7.0
- pandas>=2.0.0
- youtube-transcript-api>=0.6.1
 
old/source/Titles.csv DELETED
@@ -1,12 +0,0 @@
- Titles
- "Sarah C. M. Paine - WW2, Taiwan, Ukraine, & Maritime vs Continental Powers"
- "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
- "Mark Zuckerberg - Llama 3, $10B Models, Caesar Augustus, & 1 GW Datacenters"
- "Ilya Sutskever (OpenAI Chief Scientist) - Building AGI, Alignment, Spies, Microsoft, & Enlightenment"
- "Grant Sanderson ( @3blue1brown ) - Past, Present, & Future of Mathematics"
- Daniel Yergin – Oil Explains the Entire 20th Century
- "Leopold Aschenbrenner - 2027 AGI, China/US Super-Intelligence Race, & The Return of History"
- "Dario Amodei (Anthropic CEO) - $10 Billion Models, OpenAI, Scaling, & Alignment"
- @Asianometry & Dylan Patel – How the Semiconductor Industry Actually Works
- "Demis Hassabis – Scaling, Superhuman AIs, AlphaZero atop LLMs, AlphaFold"
- Sholto Douglas & Trenton Bricken - How to Build & Understand GPT-7's Mind
 
source/.DS_Store ADDED
Binary file (6.15 kB)
 
{old/source → source}/Timestamps.csv RENAMED
File without changes
{old/source → source}/Titles & Thumbnails.csv RENAMED
File without changes
{old/source → source}/Viral Episode Descriptions.csv RENAMED
File without changes
{old/source → source}/Viral Twitter Clips.csv RENAMED
File without changes