llm_project

Build error

App Files Files Community

llm_project / data /scraping_scripts /csv_to_jsonl.py

Nullpointer-KK

Add files via upload

c9da9e4 unverified 15 days ago

raw

history blame contribute delete

2.09 kB

	import json
	import uuid

	import pandas as pd
	import tiktoken


	# Function to count tokens using tiktoken
	def num_tokens_from_string(string: str, encoding_name: str) -> int:
	    """Return the number of tokens in ``string`` under a tiktoken encoding.

	    Args:
	        string: Text to tokenize.
	        encoding_name: Encoding name passed to ``tiktoken.get_encoding``.

	    Returns:
	        The length of the token sequence produced by the encoding.
	    """
	    encoding = tiktoken.get_encoding(encoding_name)
	    # Keep every special token disallowed except "<|endoftext|>". The literal
	    # must match the token text exactly (no backslashes), otherwise the set
	    # difference removes nothing and text containing the token is rejected.
	    disallowed = encoding.special_tokens_set - {"<|endoftext|>"}
	    return len(encoding.encode(string, disallowed_special=disallowed))


	# Function to clean or remove specific content, e.g., copyright headers
	def remove_copyright_header(content: str) -> str:
	    """Return ``content`` after removing boilerplate such as copyright headers.

	    This is currently a placeholder: no cleaning logic is implemented, so the
	    input is returned unchanged.

	    Args:
	        content: Raw document text.

	    Returns:
	        The (currently unmodified) document text.
	    """
	    # Implement any cleaning logic you need here
	    return content


	# Function to convert DataFrame to JSONL format with token counting
	def convert_to_jsonl_with_conditions(
	    df,
	    encoding_name="cl100k_base",
	    *,
	    min_tokens=100,
	    max_tokens=200_000,
	    retrieve_doc_max_tokens=8000,
	):
	    """Build JSONL-ready records from the rows of ``df``, filtered by token count.

	    Each row is read through its ``"text"``, ``"title"`` and ``"tai_url"``
	    columns. The token count is computed on the raw ``"text"`` value, before
	    ``remove_copyright_header`` is applied.

	    Args:
	        df: DataFrame with ``"text"``, ``"title"`` and ``"tai_url"`` columns.
	        encoding_name: tiktoken encoding name used for counting tokens.
	        min_tokens: Rows with fewer tokens than this are skipped.
	        max_tokens: Rows with more tokens than this are skipped.
	        retrieve_doc_max_tokens: ``"retrieve_doc"`` is True for rows whose
	            token count does not exceed this value.

	    Returns:
	        A list of dicts with the keys ``tokens``, ``doc_id``, ``name``,
	        ``url``, ``retrieve_doc``, ``source`` and ``content``.
	    """
	    jsonl_data = []
	    for _, row in df.iterrows():
	        text = row["text"]

	        # Empty CSV cells are loaded by pandas as NaN (a float), which the
	        # tokenizer cannot encode; skip such rows instead of crashing.
	        if not isinstance(text, str):
	            print(f"Skipping {row['title']} due to missing or non-text content")
	            continue

	        token_count = num_tokens_from_string(text, encoding_name)

	        # Skip entries based on token count conditions
	        if token_count < min_tokens or token_count > max_tokens:
	            print(f"Skipping {row['title']} due to token count {token_count}")
	            continue

	        jsonl_data.append(
	            {
	                "tokens": token_count,  # Token count using tiktoken
	                "doc_id": str(uuid.uuid4()),  # Fresh random UUID per record
	                "name": row["title"],
	                "url": row["tai_url"],
	                "retrieve_doc": token_count <= retrieve_doc_max_tokens,
	                "source": "tai_blog",
	                "content": remove_copyright_header(text),
	            }
	        )
	    return jsonl_data


	# Load the CSV file (path is relative to the current working directory)
	data = pd.read_csv("data/tai.csv")

	# Convert the dataframe to JSONL format with token counting and conditions
	jsonl_data_with_conditions = convert_to_jsonl_with_conditions(data)

	# Save the output to a new JSONL file using json.dumps to ensure proper escaping.
	# json.dumps emits ASCII-only text by default; the explicit encoding keeps the
	# written bytes independent of the platform's default locale.
	output_path = "data/tai_blog_data_conditions.jsonl"
	with open(output_path, "w", encoding="utf-8") as f:
	    for entry in jsonl_data_with_conditions:
	        # One JSON object per line, as the JSONL format requires
	        f.write(json.dumps(entry) + "\n")