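"""Add reference links to a transcript.

Splits the transcript into chunks, asks Claude to flag terms worth
linking, finds a URL for each term with Exa search, and writes a copy
of the transcript with markdown links inserted.

Usage (requires ANTHROPIC_API_KEY and EXA_API_KEY in the environment;
the script filename here is illustrative):

    python add_links.py output/transcripts/transcript.md --output out.md
"""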
import argparse
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import anthropic
from exa_py import Exa


@dataclass
class Term:
    """A term identified for linking, with its explanation."""
    term: str
    reason: str


@dataclass
class Link:
    """A link found for a term."""
    term: str
    url: str
    title: str


def chunk_text(text: str, max_chunk_size: int = 2000) -> List[str]:
    """Split text into chunks of roughly equal size at paragraph boundaries."""
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = []
    current_size = 0
    for para in paragraphs:
        para_size = len(para)
        # Start a new chunk once adding this paragraph would exceed the limit
        if current_size + para_size > max_chunk_size and current_chunk:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = [para]
            current_size = para_size
        else:
            current_chunk.append(para)
            current_size += para_size
    if current_chunk:
        chunks.append("\n\n".join(current_chunk))
    return chunks

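
# Illustrative behavior (hypothetical input): two 1,500-character paragraphs
# together exceed the 2,000-character default, so they land in separate chunks:
#   chunk_text("a" * 1500 + "\n\n" + "b" * 1500)  # -> ["a" * 1500, "b" * 1500]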

def parse_claude_response(response: str) -> List[Term]:
    """Parse Claude's response to extract terms and reasons."""
    terms = []
    current_term = None
    current_reason = None
    for line in response.split("\n"):
        line = line.strip()
        if not line:
            continue
        if line.startswith("TERM: "):
            # Save the previous term, if complete
            if current_term and current_reason:
                terms.append(Term(current_term, current_reason))
            current_term = line[len("TERM: "):].strip()
            current_reason = None
        elif line.startswith("REASON: "):
            current_reason = line[len("REASON: "):].strip()
    # Add the final term
    if current_term and current_reason:
        terms.append(Term(current_term, current_reason))
    return terms

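
# The parser above expects a line-oriented answer from Claude in this shape
# (an illustrative sketch; the exact instructions live in prompts/find_links.txt):
#
#   TERM: transformer architecture
#   REASON: Central concept a reader may want background on.
#
# A term is only kept once both a TERM line and a following REASON line are seen.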

def find_links_for_terms(exa: Exa, terms: List[Term]) -> Dict[str, Link]:
    """Find the best link for each term using Exa search."""
    links = {}
    for term in terms:
        # An earlier variant biased the query toward authoritative sources:
        # query = f"The best explanation or overview of {term.term} is (site: wikipedia.org OR site: .edu OR site: .gov):"
        try:
            # Search with Exa and take the top result
            results = exa.search(term.term, num_results=1, type="auto")
            if results.results:
                result = results.results[0]
                links[term.term] = Link(
                    term=term.term,
                    url=result.url,
                    # The result may have no title; fall back to the term itself
                    title=result.title or term.term,
                )
        except Exception as e:
            print(f"Error finding link for {term.term}: {e}")
            continue
    return links

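
# Hypothetical usage sketch:
#   exa = Exa(api_key=os.getenv("EXA_API_KEY"))
#   links = find_links_for_terms(exa, [Term("gradient descent", "core concept")])
#   # -> {"gradient descent": Link(term="gradient descent", url=..., title=...)}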

def add_links_to_text(text: str, links: Dict[str, Link]) -> str:
    """Add a markdown link for the first occurrence of each linked term."""
    # Sort terms by length (descending) so longer terms are linked before any
    # shorter terms they contain. Each dict key is unique, so no extra
    # bookkeeping is needed to avoid linking a term twice.
    terms = sorted(links.keys(), key=len, reverse=True)
    result = text
    for term in terms:
        # Match the term as a whole word and link only its first occurrence
        pattern = re.compile(fr"\b{re.escape(term)}\b")
        link = links[term]
        result = pattern.sub(f"[{term}]({link.url})", result, count=1)
    return result

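
# Illustrative example (hypothetical URL): only the first whole-word,
# case-sensitive occurrence of the term is replaced.
#   add_links_to_text(
#       "Gradient descent is common. We tune gradient descent carefully.",
#       {"gradient descent": Link("gradient descent", "https://example.com/gd", "GD")},
#   )
#   # -> "Gradient descent is common. We tune [gradient descent](https://example.com/gd) carefully."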

def process_transcript(
    transcript_path: Path,
    claude_client: anthropic.Anthropic,
    exa_client: Exa,
    prompt_template: str,
) -> str:
    """Process a transcript file to add reference links."""
    text = transcript_path.read_text()
    # Split into chunks so each request stays a manageable size
    chunks = chunk_text(text)
    # Ask Claude for link-worthy terms in each chunk
    all_terms = []
    for chunk in chunks:
        prompt = prompt_template + "\n\n" + chunk
        response = claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system="You are a helpful AI assistant.",
            messages=[{"role": "user", "content": prompt}],
        )
        all_terms.extend(parse_claude_response(response.content[0].text))
    # Deduplicate terms flagged in more than one chunk before searching
    unique_terms = list({t.term: t for t in all_terms}.values())
    links = find_links_for_terms(exa_client, unique_terms)
    # Link each term's first occurrence in the full text, not just in the
    # chunk where it was flagged
    return add_links_to_text(text, links)


def main() -> int:
    parser = argparse.ArgumentParser(description="Add reference links to a transcript")
    parser.add_argument(
        "transcript",
        nargs="?",  # the argument is optional
        default="output/transcripts/transcript.md",
        help="Path to transcript file (default: output/transcripts/transcript.md)",
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: input path with -linked suffix)",
    )
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    if not transcript_path.exists():
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Set up the output path
    if args.output:
        output_path = Path(args.output)
    else:
        stem = transcript_path.stem
        output_path = transcript_path.parent / f"{stem}-linked{transcript_path.suffix}"

    # Read the prompt template
    prompt_path = Path("prompts/find_links.txt")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    prompt_template = prompt_path.read_text()

    # Initialize API clients from environment variables
    claude_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    exa_client = Exa(api_key=os.getenv("EXA_API_KEY"))

    try:
        linked_text = process_transcript(
            transcript_path,
            claude_client,
            exa_client,
            prompt_template,
        )
        output_path.write_text(linked_text)
        print(f"Processed transcript saved to: {output_path}")
    except Exception as e:
        print(f"Error processing transcript: {e}")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate the return code as the process exit status
    raise SystemExit(main())