# producer/scripts/add_links.py
# Commit eb91dd7 — "adding links works!" (dwarkesh), 6.47 kB.
# NOTE(review): the lines above were file-viewer chrome ("raw / history /
# blame") accidentally captured into the source; preserved here as a comment.
import argparse
from pathlib import Path
import os
import re
from typing import List, Dict, Tuple
from dataclasses import dataclass
import anthropic
from exa_py import Exa
@dataclass
class Term:
    """A term identified for linking with its explanation.

    Produced by parse_claude_response from Claude's "TERM:"/"REASON:" lines.
    """
    # The exact phrase, as it should appear in the transcript, to turn into a link.
    term: str
    # Claude's stated justification for why this phrase deserves a link.
    reason: str
@dataclass
class Link:
    """A link found for a term (top Exa search result)."""
    # The term this link resolves; duplicates the dict key used in
    # find_links_for_terms' return value.
    term: str
    # URL of the top search result.
    url: str
    # Title of the top search result.
    title: str
def chunk_text(text: str, max_chunk_size: int = 2000) -> List[str]:
    """Split *text* into chunks of roughly max_chunk_size characters.

    Splits only at paragraph ("\\n\\n") boundaries, so a single paragraph
    longer than max_chunk_size becomes its own oversized chunk.
    """
    chunks: List[str] = []
    buffer: List[str] = []   # paragraphs accumulated for the current chunk
    buffered_len = 0         # total character count of paragraphs in buffer

    for paragraph in text.split("\n\n"):
        # Flush the buffer when adding this paragraph would overflow it
        # (but never emit an empty chunk).
        if buffer and buffered_len + len(paragraph) > max_chunk_size:
            chunks.append("\n\n".join(buffer))
            buffer = [paragraph]
            buffered_len = len(paragraph)
        else:
            buffer.append(paragraph)
            buffered_len += len(paragraph)

    if buffer:
        chunks.append("\n\n".join(buffer))
    return chunks
def parse_claude_response(response: str) -> List[Term]:
    """Parse Claude's output into Term objects.

    Expects alternating "TERM: ..." / "REASON: ..." lines; a term is only
    emitted once its matching REASON line has been seen. Blank lines and
    unrecognized lines are ignored.
    """
    parsed: List[Term] = []
    pending_term = None
    pending_reason = None

    for raw_line in response.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue
        if stripped.startswith("TERM: "):
            # A new TERM flushes the previous complete term/reason pair.
            if pending_term and pending_reason:
                parsed.append(Term(pending_term, pending_reason))
            pending_term = stripped[len("TERM: "):].strip()
            pending_reason = None
        elif stripped.startswith("REASON: "):
            pending_reason = stripped[len("REASON: "):].strip()

    # Flush the trailing pair, if complete.
    if pending_term and pending_reason:
        parsed.append(Term(pending_term, pending_reason))
    return parsed
def find_links_for_terms(exa: Exa, terms: List[Term]) -> Dict[str, Link]:
    """Resolve each term to its top Exa search result.

    Best-effort: any failure for one term is printed and skipped so a
    single bad query cannot abort the whole batch. Terms with no search
    results are silently omitted from the returned mapping.
    """
    found: Dict[str, Link] = {}
    for candidate in terms:
        try:
            search_response = exa.search(candidate.term, num_results=1, type="auto")
            if search_response.results:
                top = search_response.results[0]
                found[candidate.term] = Link(
                    term=candidate.term,
                    url=top.url,
                    title=top.title,
                )
        except Exception as exc:
            print(f"Error finding link for {candidate.term}: {exc}")
            continue
    return found
def add_links_to_text(text: str, links: Dict[str, "Link"]) -> str:
    """Insert a markdown link for the first occurrence of each linked term.

    Terms are processed longest-first so that e.g. "AI safety" is linked
    before "AI". Fix over the previous version: a match that falls inside
    an already-inserted markdown link (its text or its URL) is now skipped,
    so shorter terms can no longer produce nested/broken links like
    "[[AI](...) safety](...)". Only the first occurrence *outside* any
    existing link is replaced.
    """
    # Matches a complete markdown link: [text](url)
    markdown_link = re.compile(r'\[[^\]]*\]\([^)]*\)')
    result = text
    for term in sorted(links, key=len, reverse=True):
        word_pattern = re.compile(rf'\b{re.escape(term)}\b')
        replacement = f"[{term}]({links[term].url})"
        pieces = []
        pos = 0
        replaced = False
        # Walk the existing links, substituting only in the plain-text gaps
        # between them; the link spans themselves are copied verbatim.
        for match in markdown_link.finditer(result):
            gap = result[pos:match.start()]
            if not replaced:
                # Lambda replacement avoids backslash-escape interpretation
                # of the URL by re.sub.
                gap, n = word_pattern.subn(lambda _m: replacement, gap, count=1)
                replaced = n > 0
            pieces.append(gap)
            pieces.append(match.group())
            pos = match.end()
        tail = result[pos:]
        if not replaced:
            tail = word_pattern.sub(lambda _m: replacement, tail, count=1)
        pieces.append(tail)
        result = "".join(pieces)
    return result
def process_transcript(
    transcript_path: Path,
    claude_client: anthropic.Client,
    exa_client: Exa,
    prompt_template: str
) -> str:
    """Run the full linking pipeline over one transcript file.

    Chunks the transcript, asks Claude for link-worthy terms per chunk,
    resolves each term to a URL with Exa, and returns the transcript text
    with markdown links inserted.
    """
    text = transcript_path.read_text()

    # Collect term suggestions chunk by chunk (keeps each prompt small).
    collected_terms: List[Term] = []
    for chunk in chunk_text(text):
        message = claude_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system="You are a helpful AI assistant.",
            messages=[{"role": "user", "content": prompt_template + "\n\n" + chunk}]
        )
        collected_terms.extend(parse_claude_response(message.content[0].text))

    # Resolve links for every suggested term, then link the full text.
    resolved = find_links_for_terms(exa_client, collected_terms)
    return add_links_to_text(text, resolved)
def main():
    """CLI entry point: add reference links to a transcript file.

    Returns 0 on success, 1 if processing failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "transcript",
        nargs="?",  # positional, but optional — falls back to the default path
        default="output/transcripts/transcript.md",
        help="Path to transcript file (default: output/transcripts/transcript.md)"
    )
    parser.add_argument("--output", help="Output file path (default: input path with -linked suffix)")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    if not transcript_path.exists():
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Default output sits next to the input with a "-linked" suffix.
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = transcript_path.parent / f"{transcript_path.stem}-linked{transcript_path.suffix}"

    prompt_path = Path("prompts/find_links.txt")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    prompt_template = prompt_path.read_text()

    # API keys come from the environment; passing None lets each client
    # raise its own configuration error.
    claude_client = anthropic.Client(api_key=os.getenv("ANTHROPIC_API_KEY"))
    exa_client = Exa(api_key=os.getenv("EXA_API_KEY"))

    try:
        linked_text = process_transcript(
            transcript_path,
            claude_client,
            exa_client,
            prompt_template
        )
        output_path.write_text(linked_text)
        print(f"Processed transcript saved to: {output_path}")
    except Exception as e:
        print(f"Error processing transcript: {e}")
        return 1
    return 0
if __name__ == "__main__":
    # Fix: main() returns 0/1 but the value was previously discarded, so
    # failures exited with status 0. SystemExit propagates it to the shell.
    raise SystemExit(main())