Spaces:
Build error
Build error
import json | |
import uuid | |
import pandas as pd | |
import tiktoken | |
# Count the tokens of a string with a named tiktoken encoding
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens ``string`` occupies under ``encoding_name``.

    Every special token known to the encoding, except ``<|endoftext|>``, is
    passed to ``encode`` as ``disallowed_special``; ``<|endoftext|>`` is left
    out of that set on purpose.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    blocked_specials = tokenizer.special_tokens_set - {"<|endoftext|>"}
    token_ids = tokenizer.encode(string, disallowed_special=blocked_specials)
    return len(token_ids)
# Hook for cleaning or removing specific content, e.g., copyright headers
def remove_copyright_header(content: str) -> str:
    """Return ``content`` as-is.

    This is a placeholder: no cleaning logic is implemented yet, so the
    input string is handed back untouched.
    """
    # Implement any cleaning logic you need here
    cleaned = content
    return cleaned
# Convert a DataFrame of articles into JSONL-ready records with token counting
def convert_to_jsonl_with_conditions(
    df,
    encoding_name="cl100k_base",
    *,
    min_tokens=100,
    max_tokens=200_000,
    retrieve_max_tokens=8000,
):
    """Build one record per row of ``df`` whose text passes the token filter.

    Args:
        df: DataFrame providing the ``"text"``, ``"title"`` and ``"tai_url"``
            columns read below.
        encoding_name: tiktoken encoding name used for token counting.
        min_tokens: rows with fewer tokens than this are skipped.
        max_tokens: rows with more tokens than this are skipped.
        retrieve_max_tokens: ``"retrieve_doc"`` is ``True`` only for rows
            whose token count is at most this value.

    Returns:
        A list of dicts with the keys ``tokens``, ``doc_id``, ``name``,
        ``url``, ``retrieve_doc``, ``source`` and ``content``, in row order.
        Each ``doc_id`` is a freshly generated random UUID string.
    """
    records = []
    for _, row in df.iterrows():
        text = row["text"]
        title = row["title"]

        # A missing value (e.g. an empty CSV cell, which pandas loads as NaN)
        # is not a str and cannot be tokenized; skip the row instead of
        # letting one bad cell abort the whole conversion.
        if not isinstance(text, str):
            print(f"Skipping {title} due to missing text")
            continue

        # NOTE: the count is taken on the raw text, before cleaning.
        token_count = num_tokens_from_string(text, encoding_name)

        # Skip entries based on token count conditions
        if token_count < min_tokens or token_count > max_tokens:
            print(f"Skipping {title} due to token count {token_count}")
            continue

        records.append(
            {
                "tokens": token_count,  # Token count using tiktoken
                "doc_id": str(uuid.uuid4()),  # Generate a unique UUID
                "name": title,
                "url": row["tai_url"],
                "retrieve_doc": token_count <= retrieve_max_tokens,
                "source": "tai_blog",
                "content": remove_copyright_header(text),
            }
        )
    return records
# Load the CSV file
data = pd.read_csv("data/tai.csv")

# Convert the dataframe to JSONL format with token counting and conditions
jsonl_data_with_conditions = convert_to_jsonl_with_conditions(data)

# Save the output to a new JSONL file (one JSON object per line), using
# json.dumps to ensure proper escaping
output_path = "data/tai_blog_data_conditions.jsonl"

# Explicit encoding so the written file does not depend on the platform's
# locale default.
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in jsonl_data_with_conditions)