adding links works!
Browse files- prompts/find_links.txt +27 -0
- scripts/add_links.py +209 -0
prompts/find_links.txt
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
You are an expert at identifying key terms, concepts, and proper nouns in text that would benefit from having reference links. Your task is to analyze the provided transcript text and identify terms that should be linked to provide additional context and information for readers.
|
2 |
+
|
3 |
+
Guidelines for identifying terms:
|
4 |
+
1. Focus on technical terms, scientific concepts, proper nouns, and important ideas that readers might want to learn more about
|
5 |
+
2. Avoid common words or phrases that don't need additional context
|
6 |
+
3. For each term you identify, explain briefly why it would be valuable to link it
|
7 |
+
4. If a term appears multiple times, only identify it once
|
8 |
+
5. Limit your selection to the most important 3-5 terms per chunk of text
|
9 |
+
|
10 |
+
Respond in this format for each term:
|
11 |
+
TERM: <the exact term as it appears in text>
|
12 |
+
REASON: <1-2 sentences explaining why this term should be linked>
|
13 |
+
|
14 |
+
Example input:
|
15 |
+
"The quantum computer used superconducting qubits to achieve quantum supremacy, a milestone that Google's Sycamore processor reached in 2019."
|
16 |
+
|
17 |
+
Example output:
|
18 |
+
TERM: superconducting qubits
|
19 |
+
REASON: A fundamental building block of certain quantum computers that readers should understand to grasp quantum computing concepts.
|
20 |
+
|
21 |
+
TERM: quantum supremacy
|
22 |
+
REASON: A significant milestone in quantum computing that represents when quantum computers outperform classical computers.
|
23 |
+
|
24 |
+
TERM: Google's Sycamore
|
25 |
+
REASON: A specific quantum processor that made history, providing context about real-world quantum computing achievements.
|
26 |
+
|
27 |
+
Analyze the following transcript text and identify key terms that should be linked:
|
scripts/add_links.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
from pathlib import Path
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
from typing import List, Dict, Tuple
|
6 |
+
from dataclasses import dataclass
|
7 |
+
import anthropic
|
8 |
+
from exa_py import Exa
|
9 |
+
|
10 |
+
@dataclass
class Term:
    """A candidate term for linking, paired with the model's rationale."""
    term: str    # exact surface form as it appears in the transcript text
    reason: str  # short explanation of why this term deserves a link
|
15 |
+
|
16 |
+
@dataclass
class Link:
    """A reference link resolved for a term via web search."""
    term: str   # the term this link was found for
    url: str    # destination URL of the best search result
    title: str  # page title reported by the search result
|
22 |
+
|
23 |
+
def chunk_text(text: str, max_chunk_size: int = 2000) -> List[str]:
    """Split text into chunks of at most ~max_chunk_size characters.

    Chunks are built at paragraph ("\n\n") boundaries so no paragraph is
    ever split. A single paragraph longer than max_chunk_size becomes its
    own (oversized) chunk.

    Args:
        text: The full text to split.
        max_chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of chunk strings; an empty list for empty input.
    """
    # Guard: ""-input would otherwise produce [""] and trigger a
    # pointless downstream API call for an empty chunk.
    if not text:
        return []

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_size = 0

    for para in text.split("\n\n"):
        # Count the "\n\n" that will rejoin this paragraph to the current
        # chunk, so the emitted chunk length actually respects the cap.
        sep = 2 if current_chunk else 0
        if current_chunk and current_size + sep + len(para) > max_chunk_size:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = [para]
            current_size = len(para)
        else:
            current_chunk.append(para)
            current_size += sep + len(para)

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    return chunks
|
44 |
+
|
45 |
+
def parse_claude_response(response: str) -> List[Term]:
    """Extract Term entries from Claude's "TERM: / REASON:" formatted reply.

    A term is recorded only once both its TERM: and REASON: lines have
    been seen; entries missing either half are silently dropped.

    Args:
        response: Raw text of Claude's reply.

    Returns:
        Terms in the order they appear in the response.
    """
    terms: List[Term] = []
    pending_term = None
    pending_reason = None

    for raw_line in response.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue

        if stripped.startswith("TERM: "):
            # A new TERM: line flushes the previous complete pair, if any.
            if pending_term and pending_reason:
                terms.append(Term(pending_term, pending_reason))
            pending_term = stripped[len("TERM: "):].strip()
            pending_reason = None
        elif stripped.startswith("REASON: "):
            pending_reason = stripped[len("REASON: "):].strip()

    # Flush the trailing pair.
    if pending_term and pending_reason:
        terms.append(Term(pending_term, pending_reason))

    return terms
|
70 |
+
|
71 |
+
def find_links_for_terms(exa: Exa, terms: List[Term]) -> Dict[str, Link]:
    """Find the best reference link for each term using Exa search.

    Args:
        exa: Initialized Exa search client.
        terms: Terms to look up; duplicate terms are searched only once.

    Returns:
        Mapping from term text to the best Link found. Terms whose
        search fails or returns no results are omitted.
    """
    links: Dict[str, Link] = {}

    for term in terms:
        # Terms can repeat across transcript chunks; skip ones already
        # resolved to avoid redundant API calls.
        if term.term in links:
            continue

        try:
            # Take the single top-ranked result as the link target.
            results = exa.search(term.term, num_results=1, type="auto")
            if results.results:
                top = results.results[0]
                links[term.term] = Link(
                    term=term.term,
                    url=top.url,
                    title=top.title
                )
        except Exception as e:
            # Best-effort: a failed lookup just leaves this term unlinked.
            print(f"Error finding link for {term.term}: {e}")
            continue

    return links
|
94 |
+
|
95 |
+
def add_links_to_text(text: str, links: Dict[str, Link]) -> str:
    """Insert a markdown link at the first occurrence of each term.

    The text is scanned in a single left-to-right pass using one combined
    pattern (longest terms first). Because replacement text is never
    rescanned, a short term can no longer match inside a previously
    inserted [term](url) link and corrupt the markdown — which the old
    per-term sequential substitution allowed.

    Args:
        text: Original transcript text.
        links: Mapping from term to the Link to insert for it.

    Returns:
        Text with each linked term's first occurrence wrapped as
        [term](url); all other occurrences are left untouched.
    """
    if not links:
        return text

    # Longest-first alternation so overlapping terms prefer the longer match.
    ordered = sorted(links, key=len, reverse=True)
    pattern = re.compile("|".join(fr"\b{re.escape(term)}\b" for term in ordered))

    seen = set()

    def _link_once(match: re.Match) -> str:
        term = match.group(0)
        if term in seen:
            return term  # only the first occurrence of each term is linked
        seen.add(term)
        return f"[{term}]({links[term].url})"

    return pattern.sub(_link_once, text)
|
119 |
+
|
120 |
+
def process_transcript(
    transcript_path: Path,
    claude_client: anthropic.Client,
    exa_client: Exa,
    prompt_template: str
) -> str:
    """Process a transcript file and return its text with reference links added.

    Args:
        transcript_path: File containing the transcript text.
        claude_client: Anthropic client used to identify link-worthy terms.
        exa_client: Exa client used to resolve terms to URLs.
        prompt_template: Term-identification prompt; the chunk is appended to it.

    Returns:
        The transcript text with markdown links inserted.
    """
    text = transcript_path.read_text()

    # Ask Claude for link-worthy terms, one chunk at a time.
    all_terms: List[Term] = []
    for chunk in chunk_text(text):
        message = claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system="You are a helpful AI assistant.",
            messages=[{"role": "user", "content": prompt_template + "\n\n" + chunk}]
        )
        all_terms.extend(parse_claude_response(message.content[0].text))

    # Resolve each term to a URL, then weave the links into the full text.
    links = find_links_for_terms(exa_client, all_terms)
    return add_links_to_text(text, links)
|
156 |
+
|
157 |
+
def main():
    """CLI entry point: add reference links to a transcript file.

    Reads the transcript and the prompt template, identifies terms via
    Claude, resolves them via Exa, and writes a "-linked" copy.

    Returns:
        0 on success, 1 if processing failed.

    Raises:
        FileNotFoundError: If the transcript or prompt file is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "transcript",
        nargs="?",  # optional positional; falls back to the default path
        default="output/transcripts/transcript.md",
        help="Path to transcript file (default: output/transcripts/transcript.md)"
    )
    parser.add_argument("--output", help="Output file path (default: input path with -linked suffix)")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    if not transcript_path.exists():
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Derive the output path next to the input unless one was given.
    if args.output:
        output_path = Path(args.output)
    else:
        stem = transcript_path.stem
        output_path = transcript_path.parent / f"{stem}-linked{transcript_path.suffix}"

    # Read the term-identification prompt template.
    prompt_path = Path("prompts/find_links.txt")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    prompt_template = prompt_path.read_text()

    # API keys come from the environment (ANTHROPIC_API_KEY, EXA_API_KEY).
    claude_client = anthropic.Client(api_key=os.getenv("ANTHROPIC_API_KEY"))
    exa_client = Exa(api_key=os.getenv("EXA_API_KEY"))

    try:
        linked_text = process_transcript(
            transcript_path,
            claude_client,
            exa_client,
            prompt_template
        )

        output_path.write_text(linked_text)
        print(f"Processed transcript saved to: {output_path}")

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return 1

    return 0

if __name__ == "__main__":
    # Propagate main()'s status code to the shell; the bare main() call
    # previously discarded it, so failures still exited with status 0.
    raise SystemExit(main())
|