dwarkesh committed on
Commit
eb91dd7
·
1 Parent(s): d3c00bf

adding links works!

Browse files
Files changed (2) hide show
  1. prompts/find_links.txt +27 -0
  2. scripts/add_links.py +209 -0
prompts/find_links.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert at identifying key terms, concepts, and proper nouns in text that would benefit from having reference links. Your task is to analyze the provided transcript text and identify terms that should be linked to provide additional context and information for readers.
2
+
3
+ Guidelines for identifying terms:
4
+ 1. Focus on technical terms, scientific concepts, proper nouns, and important ideas that readers might want to learn more about
5
+ 2. Avoid common words or phrases that don't need additional context
6
+ 3. For each term you identify, explain briefly why it would be valuable to link it
7
+ 4. If a term appears multiple times, only identify it once
8
+ 5. Limit your selection to the most important 3-5 terms per chunk of text
9
+
10
+ Respond in this format for each term:
11
+ TERM: <the exact term as it appears in text>
12
+ REASON: <1-2 sentences explaining why this term should be linked>
13
+
14
+ Example input:
15
+ "The quantum computer used superconducting qubits to achieve quantum supremacy, a milestone that Google's Sycamore processor reached in 2019."
16
+
17
+ Example output:
18
+ TERM: superconducting qubits
19
+ REASON: A fundamental building block of certain quantum computers that readers should understand to grasp quantum computing concepts.
20
+
21
+ TERM: quantum supremacy
22
+ REASON: A significant milestone in quantum computing that represents when quantum computers outperform classical computers.
23
+
24
+ TERM: Google's Sycamore
25
+ REASON: A specific quantum processor that made history, providing context about real-world quantum computing achievements.
26
+
27
+ Analyze the following transcript text and identify key terms that should be linked:
scripts/add_links.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple

import anthropic
from exa_py import Exa
9
+
10
@dataclass
class Term:
    """A key term that the model flagged as worth linking.

    Instances are built from the TERM:/REASON: lines of Claude's
    response by ``parse_claude_response``.
    """
    # The exact term text as it appeared in the transcript chunk.
    term: str
    # The model's short justification for why the term deserves a link.
    reason: str
15
+
16
@dataclass
class Link:
    """A search result chosen as the reference link for one term."""
    # Term the link was found for.
    term: str
    # URL of the top search hit.
    url: str
    # Page title as reported by the search API.
    title: str
22
+
23
def chunk_text(text: str, max_chunk_size: int = 2000) -> List[str]:
    """Break *text* into pieces of roughly ``max_chunk_size`` characters.

    Splitting happens only on blank-line (paragraph) boundaries, so a
    single paragraph longer than the limit is kept whole.  The "\n\n"
    separators re-inserted by the join are not counted toward the
    running size.
    """
    chunks: List[str] = []
    pending: List[str] = []
    pending_len = 0

    for paragraph in text.split("\n\n"):
        # Start a new chunk when adding this paragraph would overflow,
        # but never emit an empty chunk.
        if pending and pending_len + len(paragraph) > max_chunk_size:
            chunks.append("\n\n".join(pending))
            pending = [paragraph]
            pending_len = len(paragraph)
        else:
            pending.append(paragraph)
            pending_len += len(paragraph)

    if pending:
        chunks.append("\n\n".join(pending))
    return chunks
44
+
45
def parse_claude_response(response: str) -> List[Term]:
    """Extract (term, reason) pairs from Claude's TERM:/REASON: output.

    A pair is recorded only once both its TERM: and REASON: lines have
    been seen; a new TERM: line restarts the pair, so a term that never
    received a reason is silently dropped.
    """
    parsed: List[Term] = []
    term_text = None
    reason_text = None

    def flush() -> None:
        # Record the pair in progress, if it is complete.
        if term_text and reason_text:
            parsed.append(Term(term_text, reason_text))

    for raw_line in response.split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith("TERM: "):
            flush()
            term_text = stripped[len("TERM: "):].strip()
            reason_text = None
        elif stripped.startswith("REASON: "):
            reason_text = stripped[len("REASON: "):].strip()

    # Don't lose the final pair.
    flush()
    return parsed
70
+
71
def find_links_for_terms(exa: Exa, terms: List[Term]) -> Dict[str, Link]:
    """Look up a reference URL for each term via Exa search.

    Args:
        exa: Authenticated Exa search client.
        terms: Terms to find links for.

    Returns:
        Mapping from term text to the top search result.  Terms whose
        search failed or returned no results are omitted from the map.
    """
    links: Dict[str, Link] = {}

    for term in terms:
        try:
            # Take only the top hit; type="auto" lets Exa pick the
            # search mode for the query.
            results = exa.search(term.term, num_results=1, type="auto")
            if results.results:
                result = results.results[0]
                links[term.term] = Link(
                    term=term.term,
                    url=result.url,
                    title=result.title
                )
        except Exception as e:
            # Best-effort: report and keep going so one bad query does
            # not abort the whole batch.
            print(f"Error finding link for {term.term}: {e}")
            continue

    return links
94
+
95
def add_links_to_text(text: str, links: Dict[str, "Link"]) -> str:
    """Wrap the first occurrence of each linked term in a markdown link.

    A single combined regex is applied in one left-to-right pass so a
    shorter term can never match inside the link text or URL that was
    just inserted for a longer, overlapping term.  (The previous
    per-term substitution loop re-scanned already-linked text: with
    terms like "quantum supremacy" and "quantum" it could nest a link
    inside the link produced for the longer term.)

    Args:
        text: Transcript text to annotate.
        links: Mapping from term text to its chosen Link (only ``.url``
            is read).

    Returns:
        The text with at most one markdown link added per term.
    """
    if not links:
        return text

    # Longer terms first so "quantum supremacy" wins over "quantum"
    # inside the alternation.
    terms = sorted(links, key=len, reverse=True)
    pattern = re.compile("|".join(fr"\b{re.escape(term)}\b" for term in terms))

    # Only the first occurrence of each term becomes a link.
    linked_terms = set()

    def _linkify(match: re.Match) -> str:
        term = match.group(0)
        if term in linked_terms:
            return term
        linked_terms.add(term)
        return f"[{term}]({links[term].url})"

    return pattern.sub(_linkify, text)
119
+
120
def process_transcript(
    transcript_path: Path,
    claude_client: anthropic.Client,
    exa_client: Exa,
    prompt_template: str
) -> str:
    """Annotate a transcript with reference links for its key terms.

    Reads the file, asks Claude (chunk by chunk) which terms deserve
    links, resolves each term to a URL via Exa, and returns the text
    with markdown links inserted.
    """
    raw_text = transcript_path.read_text()

    # Collect term suggestions across every chunk of the transcript.
    suggested_terms: List[Term] = []
    for piece in chunk_text(raw_text):
        reply = claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system="You are a helpful AI assistant.",
            messages=[{"role": "user", "content": f"{prompt_template}\n\n{piece}"}],
        )
        suggested_terms.extend(parse_claude_response(reply.content[0].text))

    # Resolve terms to URLs, then weave the links back into the text.
    term_links = find_links_for_terms(exa_client, suggested_terms)
    return add_links_to_text(raw_text, term_links)
156
+
157
def main() -> int:
    """CLI entry point: add reference links to a transcript file.

    Returns:
        0 on success, 1 on a processing error (suitable for sys.exit).

    Raises:
        FileNotFoundError: If the transcript or prompt file is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "transcript",
        nargs="?",  # Make the argument optional
        default="output/transcripts/transcript.md",
        help="Path to transcript file (default: output/transcripts/transcript.md)"
    )
    parser.add_argument("--output", help="Output file path (default: input path with -linked suffix)")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    if not transcript_path.exists():
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Set up output path: explicit --output wins, otherwise derive
    # "<stem>-linked<suffix>" next to the input file.
    if args.output:
        output_path = Path(args.output)
    else:
        stem = transcript_path.stem
        output_path = transcript_path.parent / f"{stem}-linked{transcript_path.suffix}"

    # Read prompt template
    prompt_path = Path("prompts/find_links.txt")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    prompt_template = prompt_path.read_text()

    # Initialize API clients from environment variables.
    claude_client = anthropic.Client(api_key=os.getenv("ANTHROPIC_API_KEY"))
    exa_client = Exa(api_key=os.getenv("EXA_API_KEY"))

    try:
        # Process transcript (the only step expected to fail at runtime
        # once the input files exist).
        linked_text = process_transcript(
            transcript_path,
            claude_client,
            exa_client,
            prompt_template
        )
    except Exception as e:
        # Top-level boundary: report on stderr (not stdout) and signal
        # failure via the exit status instead of a traceback.
        print(f"Error processing transcript: {e}", file=sys.stderr)
        return 1

    # Save output
    output_path.write_text(linked_text)
    print(f"Processed transcript saved to: {output_path}")
    return 0
207
+
208
+ if __name__ == "__main__":
209
+ main()