dwarkesh committed
Commit 8db26e0 · 1 Parent(s): 8b25524

will this work

.DS_Store ADDED
Binary file (6.15 kB)
 
old/app.py → app.py RENAMED
File without changes
old/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
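Each deleted rule above routed matching files through Git LFS instead of regular Git storage. A minimal sketch of what that pattern matching amounts to, assuming simple fnmatch-style globbing (real .gitattributes matching has additional path rules); the pattern list and helper name are illustrative only:

    # Illustrative only; fnmatch globbing approximates gitattributes matching.
    from fnmatch import fnmatch

    LFS_PATTERNS = ["*.bin", "*.safetensors", "*.parquet", "saved_model/**/*"]

    def uses_lfs(path: str) -> bool:
        # True if any LFS pattern matches the given path.
        return any(fnmatch(path, pattern) for pattern in LFS_PATTERNS)

    print(uses_lfs("model.safetensors"))  # True
    print(uses_lfs("app.py"))             # False
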
old/README.md DELETED
@@ -1,12 +0,0 @@
- ---
- title: Producer
- emoji: ⚡
- colorFrom: green
- colorTo: pink
- sdk: gradio
- sdk_version: 5.8.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
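The front matter above configured a Gradio Space with app.py as its entry point. A minimal sketch of the shape of app that such a config expects, assuming nothing about the real app.py in this repo; every name below is illustrative:

    # Illustrative skeleton only; the actual app.py is unchanged by this commit.
    import gradio as gr

    def respond(text: str) -> str:
        # Placeholder handler standing in for the real logic.
        return text

    demo = gr.Interface(fn=respond, inputs="text", outputs="text")

    if __name__ == "__main__":
        demo.launch()
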
old/csv-to-yaml.py DELETED
@@ -1,105 +0,0 @@
- import pandas as pd
- import yaml
- from pathlib import Path
-
- def clean_text(text):
-     if pd.isna(text):
-         return ""
-     return str(text).strip()
-
- def convert_timestamps(df):
-     examples = []
-     for row in df['Timestamps'].dropna():
-         examples.append(clean_text(row))
-     return {
-         'name': 'Timestamps Generator',
-         'description': 'Generates timestamps for key moments in podcast episodes',
-         'examples': examples
-     }
-
- def convert_titles(df):
-     examples = []
-     for title in df['Titles'].dropna():
-         examples.append(clean_text(title))
-     return {
-         'name': 'Episode Titles',
-         'description': 'Collection of episode titles',
-         'examples': examples
-     }
-
- def convert_descriptions(df):
-     examples = []
-     for _, row in df.iterrows():
-         if pd.notna(row['Tweet Text']):
-             examples.append({
-                 'text': clean_text(row['Tweet Text']),
-                 'link': clean_text(row.get('Link', ''))
-             })
-     return {
-         'name': 'Viral Episode Descriptions',
-         'description': 'Viral-worthy episode descriptions for Twitter',
-         'examples': examples
-     }
-
- def convert_titles_thumbnails(df):
-     examples = []
-     for _, row in df.iterrows():
-         if pd.notna(row['Titles']) and pd.notna(row['Thumbnail']):
-             examples.append({
-                 'title': clean_text(row['Titles']),
-                 'thumbnail': clean_text(row['Thumbnail'])
-             })
-     return {
-         'name': 'Titles and Thumbnails',
-         'description': 'Title and thumbnail combinations for episodes',
-         'examples': examples
-     }
-
- def convert_viral_clips(df):
-     examples = []
-     for _, row in df.iterrows():
-         if pd.notna(row['Tweet Text']) and pd.notna(row['Clip Transcript']):
-             example = {
-                 'tweet': clean_text(row['Tweet Text']),
-                 'transcript': clean_text(row['Clip Transcript'])
-             }
-             if pd.notna(row.get('Link')):
-                 example['link'] = clean_text(row['Link'])
-             if pd.notna(row.get('Likes')):
-                 example['metrics'] = {
-                     'likes': int(row['Likes']),
-                     'reposts': int(row['Reposts']),
-                     'quotes': int(row['Quotes'])
-                 }
-             examples.append(example)
-     return {
-         'name': 'Viral Clips',
-         'description': 'Collection of viral clips with engagement metrics',
-         'examples': examples
-     }
-
- def main():
-     # Create prompts directory
-     prompts_dir = Path('../prompts')
-     prompts_dir.mkdir(exist_ok=True)
-     print(f"Created prompts directory at {prompts_dir}")
-     # Convert each CSV
-     conversions = {
-         'Timestamps.csv': (pd.read_csv('source/Timestamps.csv'), convert_timestamps),
-         'Titles.csv': (pd.read_csv('source/Titles.csv'), convert_titles),
-         'Viral Episode Descriptions.csv': (pd.read_csv('source/Viral Episode Descriptions.csv'), convert_descriptions),
-         'Titles & Thumbnails.csv': (pd.read_csv('source/Titles & Thumbnails.csv'), convert_titles_thumbnails),
-         'Viral Twitter Clips.csv': (pd.read_csv('source/Viral Twitter Clips.csv'), convert_viral_clips)
-     }
-
-     for filename, (df, converter) in conversions.items():
-         output = converter(df)
-         yaml_filename = prompts_dir / f"{filename.split('.')[0].lower().replace(' ', '_')}.yaml"
-
-         with open(yaml_filename, 'w', encoding='utf-8') as f:
-             yaml.dump(output, f, allow_unicode=True, sort_keys=False, width=1000)
-
-         print(f"Converted {filename} to {yaml_filename}")
-
- if __name__ == "__main__":
-     main()
 
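The deleted script above writes one YAML file per CSV into a prompts/ directory. A minimal round-trip sketch, assuming a hypothetical output file prompts/titles.yaml produced by convert_titles:

    # Illustrative only; the file path is an assumption.
    import yaml

    with open("prompts/titles.yaml", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    print(data["name"])           # "Episode Titles"
    print(len(data["examples"]))  # number of converted titles
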
old/generate-transcript.py DELETED
@@ -1,222 +0,0 @@
- from dataclasses import dataclass
- import os
- from typing import List, Optional, Dict
- import json
- from google import genai
- from google.genai.types import Tool, GenerateContentConfig, GoogleSearch
- import asyncio
- from deepgram import Deepgram
- import mimetypes
-
- @dataclass
- class TranscriptSegment:
-     speaker: str
-     text: str
-     start: float
-     end: float
-
- @dataclass
- class ProcessedChunk:
-     segments: List[TranscriptSegment]
-     original_text: str
-     processed_text: Optional[str] = None
-
- class TranscriptProcessor:
-     def __init__(self, max_tokens: int = 6000):
-         """Initialize the TranscriptProcessor with API clients.
-
-         Environment variables required:
-         - GOOGLE_API_KEY: API key for Google Gemini
-         - DEEPGRAM_API_KEY: API key for Deepgram
-         """
-         self.max_tokens = max_tokens
-
-         # Get API keys from environment variables
-         genai_api_key = os.getenv('GOOGLE_API_KEY')
-         deepgram_api_key = os.getenv('DEEPGRAM_API_KEY')
-
-         if not genai_api_key:
-             raise ValueError("GOOGLE_API_KEY environment variable is not set")
-         if not deepgram_api_key:
-             raise ValueError("DEEPGRAM_API_KEY environment variable is not set")
-
-         self.genai_client = genai.Client(api_key=genai_api_key)
-         self.deepgram_client = Deepgram(deepgram_api_key)
-
-         # Create search tool
-         self.google_search_tool = Tool(
-             google_search=GoogleSearch()
-         )
-
-         self.prompt_template = """
- Your task is to improve the readability of this transcript while maintaining its core meaning. Additionally, you should add relevant links to technical terms, products, papers, people, or concepts mentioned in the transcript.
-
- Examples of input and desired output:
-
- Input:
- "Speaker A: Yeah, so like, um, you know, I've been really diving into this whole, like, transformer architecture thing, right? And, um, what's really interesting is like, you know, how they handle this attention mechanism stuff. I mean, it's like, basically, you know, the way it processes sequential data is just, like, mind-blowing if you think about it. And, um, yeah, what I'm trying to say is that, like, the whole self-attention concept is just, you know, really revolutionary and stuff."
-
- Output:
- "Speaker A: I've been exploring the [transformer architecture](https://arxiv.org/abs/1706.03762), and it's fascinating how it implements attention mechanisms. The way it processes sequential data is revolutionary, particularly the [self-attention concept](https://distill.pub/2016/augmented-rnns/#attentional-interfaces) that fundamentally changed the field."
-
- Input:
- "Speaker B: Yeah, yeah, totally, and like, you know what's really cool is that, um, I've been working with PyTorch for this project I'm doing, and like, you know, implementing BERT has been super helpful because, um, you know, it's like pre-trained and stuff, and I mean, the whole masked language modeling thing is just, like, really powerful, you know what I mean? And then, like, there's all these other models like GPT and RoBERTa that kind of like, you know, built on top of it and made things even better, if that makes sense."
-
- Output:
- "Speaker B: I've been working with [PyTorch](https://pytorch.org/) on my project, implementing [BERT](https://arxiv.org/abs/1810.04805) with its pre-trained capabilities. The masked language modeling approach has proven powerful, leading to advancements like [GPT](https://arxiv.org/abs/2005.14165) and [RoBERTa](https://arxiv.org/abs/1907.11692)."
-
- Input:
- "Speaker A: Right, right, and you know what's really interesting is like, um, when you look at the training data requirements and stuff, it's like, you know, these large language models need just a massive amount of, like, compute and data to train properly, and I mean, that's why, you know, we're seeing all these different approaches to trying to make it more efficient, like, um, you know, quantization and pruning and stuff like that. And like, yeah, I think that's why the whole debate about compute requirements is getting so much attention nowadays."
-
- Output:
- "Speaker A: The training requirements for large language models are substantial, demanding extensive compute resources and data. This has led to efficiency innovations like [quantization](https://arxiv.org/abs/2103.13630) and [pruning](https://arxiv.org/abs/2010.13103), sparking important discussions about computational sustainability in AI."
-
- Instructions:
- * Make text highly readable while maintaining the core message and natural speech patterns
- * Remove or consolidate:
-   - Filler words (um, uh, you know, like, I mean)
-   - False starts and self-corrections
-   - Repeated phrases and stutters
-   - Verbal tics and unnecessary interjections
- * Improve sentence structure
- * Format for clarity
- * Enhance coherence
- * Preserve speaker labels and technical terms exactly
- * Add Markdown-style links to:
-   - Technical terms and concepts
-   - Products and tools mentioned
-   - Research papers or articles referenced
-   - Notable people or organizations
-   - Only add links when you're confident about the reference
-   - Prioritize official documentation, papers, or authoritative sources
-   - For papers, prefer arXiv links when available
-
- Here's the transcript to process:
- {text}
- """
-
-     async def transcribe_audio(self, audio_path: str) -> List[TranscriptSegment]:
-         """Transcribe audio using Deepgram with automatic format detection"""
-         # Get the mime type of the audio file
-         mime_type = mimetypes.guess_type(audio_path)[0]
-         if not mime_type:
-             # Default to mp3 if we can't detect the type
-             mime_type = 'audio/mpeg' if audio_path.lower().endswith('.mp3') else 'audio/wav'
-
-         with open(audio_path, 'rb') as audio:
-             source = {'buffer': audio, 'mimetype': mime_type}
-             response = await self.deepgram_client.transcription.prerecorded(
-                 source,
-                 {
-                     'smart_format': True,
-                     'punctuate': True,
-                     'diarize': True,
-                     'utterances': True
-                 }
-             )
-
-         segments = []
-         for utterance in response['results']['utterances']:
-             segments.append(TranscriptSegment(
-                 speaker=f"Speaker {utterance['speaker']}",
-                 text=utterance['transcript'],
-                 start=utterance['start'],
-                 end=utterance['end']
-             ))
-
-         return segments
-
-     def create_chunks(self, segments: List[TranscriptSegment]) -> List[ProcessedChunk]:
-         chunks = []
-         current_segments = []
-         current_text = ""
-
-         for segment in segments:
-             segment_text = f"{segment.speaker}: {segment.text}\n"
-             potential_text = current_text + segment_text
-             potential_prompt = self.prompt_template.format(text=potential_text)
-
-             if len(potential_prompt.split()) > (self.max_tokens * 0.75) and current_segments:
-                 chunks.append(ProcessedChunk(
-                     segments=current_segments,
-                     original_text=current_text
-                 ))
-                 current_segments = []
-                 current_text = ""
-
-             current_segments.append(segment)
-             current_text += segment_text
-
-         if current_segments:
-             chunks.append(ProcessedChunk(
-                 segments=current_segments,
-                 original_text=current_text
-             ))
-
-         return chunks
-
-     async def process_chunk(self, chunk: ProcessedChunk) -> None:
-         """Process a chunk using Gemini 2.0 Flash with Search enabled"""
-         prompt = self.prompt_template.format(text=chunk.original_text)
-
-         response = await self.genai_client.aio.models.generate_content(  # .aio exposes the awaitable API surface
-             model='gemini-2.0-flash-exp',
-             contents=prompt,
-             config=GenerateContentConfig(
-                 tools=[self.google_search_tool],
-                 response_modalities=["TEXT"],
-             )
-         )
-
-         # Get the main response text
-         chunk.processed_text = ""
-         for part in response.candidates[0].content.parts:
-             chunk.processed_text += part.text
-
-         # Log the search metadata for debugging/verification
-         if hasattr(response.candidates[0], 'grounding_metadata') and \
-            hasattr(response.candidates[0].grounding_metadata, 'search_entry_point'):
-             print(f"Search metadata found for chunk: {response.candidates[0].grounding_metadata.search_entry_point.rendered_content}")
-
-     async def process_transcript(self, audio_path: str) -> str:
-         """Main processing pipeline"""
-         # Transcribe audio
-         segments = await self.transcribe_audio(audio_path)
-
-         # Create chunks
-         chunks = self.create_chunks(segments)
-
-         # Process each chunk with retries
-         async def process_with_retry(chunk, max_retries=3):
-             for attempt in range(max_retries):
-                 try:
-                     await self.process_chunk(chunk)
-                     return
-                 except Exception as e:
-                     if attempt == max_retries - 1:
-                         print(f"Failed to process chunk after {max_retries} attempts: {e}")
-                         raise
-                     await asyncio.sleep(1 * (attempt + 1))  # Linear backoff between attempts
-
-         # Process chunks in parallel with retries
-         tasks = [process_with_retry(chunk) for chunk in chunks]
-         await asyncio.gather(*tasks)
-
-         # Combine processed chunks
-         final_text = "\n".join(chunk.processed_text for chunk in chunks if chunk.processed_text)
-
-         return final_text
-
-
- async def main():
-     # Make sure to set these environment variables before running:
-     # export GOOGLE_API_KEY="your_google_api_key"
-     # export DEEPGRAM_API_KEY="your_deepgram_api_key"
-     processor = TranscriptProcessor()
-
-     # Example usage with either MP3 or WAV file
-     final_transcript = await processor.process_transcript("audio.mp3")
-
-     # Save as markdown file
-     with open("transcript.md", "w") as f:
-         f.write(final_transcript)
 
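The deleted module above defines main() but never schedules it on an event loop. A minimal driver sketch, assuming the file were importable (e.g. renamed to generate_transcript.py) and both API keys exported; the module name and audio path are assumptions:

    # Illustrative driver only; not part of the deleted file.
    import asyncio
    from generate_transcript import TranscriptProcessor

    async def run() -> None:
        processor = TranscriptProcessor(max_tokens=6000)
        transcript = await processor.process_transcript("audio.mp3")
        print(transcript[:500])  # preview the processed Markdown

    asyncio.run(run())
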
old/prompts.json DELETED
@@ -1,6 +0,0 @@
- {
-     "clips": "You are a social media expert for the Dwarkesh Podcast. Generate 10 viral-worthy clips from the transcript.\nFormat as:\nTweet 1\nTweet Text: [text]\nClip Transcript: [45-120 seconds of transcript]\n\nPrevious examples:\n{clips_examples}",
-     "description": "Create an engaging episode description tweet (280 chars max) that:\n1. Highlights compelling aspects\n2. Includes topic areas and handles\n3. Ends with \"Links below\" or \"Enjoy!\"\n\nPrevious examples:\n{description_examples}",
-     "timestamps": "Generate timestamps (HH:MM:SS) every 3-8 minutes covering key transitions and moments.\nUse 2-6 word descriptions.\nStart at 00:00:00.\n\nPrevious examples:\n{timestamps_examples}",
-     "titles_and_thumbnails": "Create 3-5 compelling title-thumbnail combinations that tell a story.\n\nTitle Format: \"Guest Name \u2013 Key Story or Core Insight\"\nThumbnail: 2-4 ALL CAPS words that create intrigue with the title\n\nExample: \"David Reich \u2013 How One Small Tribe Conquered the World 70,000 Years Ago\"\nThumbnail: \"LAST HUMANS STANDING\"\n\nThe combination should create intellectual curiosity without clickbait.\n\nPrevious examples:\n{titles_and_thumbnails_examples}"
- }
 
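Each deleted template above carries a {..._examples} placeholder. A minimal sketch of filling one in, assuming plain str.format substitution and a local copy of the deleted file at prompts.json (both are assumptions; the consuming code in app.py may differ):

    # Illustrative only; file path and example text are assumptions.
    import json

    with open("prompts.json", encoding="utf-8") as f:
        prompts = json.load(f)

    clips_prompt = prompts["clips"].format(
        clips_examples="Tweet 1\nTweet Text: ...\nClip Transcript: ..."
    )
    print(clips_prompt)
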
old/requirements.txt DELETED
@@ -1,4 +0,0 @@
- gradio>=4.0.0
- anthropic>=0.7.0
- pandas>=2.0.0
- youtube-transcript-api>=0.6.1
 
old/source/Titles.csv DELETED
@@ -1,12 +0,0 @@
- Titles
- "Sarah C. M. Paine - WW2, Taiwan, Ukraine, & Maritime vs Continental Powers"
- "David Reich – How One Small Tribe Conquered the World 70,000 Years Ago"
- "Mark Zuckerberg - Llama 3, $10B Models, Caesar Augustus, & 1 GW Datacenters"
- "Ilya Sutskever (OpenAI Chief Scientist) - Building AGI, Alignment, Spies, Microsoft, & Enlightenment"
- "Grant Sanderson ( @3blue1brown ) - Past, Present, & Future of Mathematics"
- Daniel Yergin – Oil Explains the Entire 20th Century
- "Leopold Aschenbrenner - 2027 AGI, China/US Super-Intelligence Race, & The Return of History"
- "Dario Amodei (Anthropic CEO) - $10 Billion Models, OpenAI, Scaling, & Alignment"
- @Asianometry & Dylan Patel – How the Semiconductor Industry Actually Works
- "Demis Hassabis – Scaling, Superhuman AIs, AlphaZero atop LLMs, AlphaFold"
- Sholto Douglas & Trenton Bricken - How to Build & Understand GPT-7's Mind
 
source/.DS_Store ADDED
Binary file (6.15 kB)
 
{old/source → source}/Timestamps.csv RENAMED
File without changes
{old/source → source}/Titles & Thumbnails.csv RENAMED
File without changes
{old/source → source}/Viral Episode Descriptions.csv RENAMED
File without changes
{old/source → source}/Viral Twitter Clips.csv RENAMED
File without changes