'''Prepare the Tiny Shakespeare dataset as GPT-2 token ids.

Ref: https://github.com/karpathy/nanoGPT/blob/master/data/shakespeare/prepare.py
'''
|
import os
from pathlib import Path

import numpy as np
import requests
from transformers import GPT2Tokenizer

from shakespeare_config import get_data_folder_path, get_config, get_gpt2_tokenizer

|
if __name__ == '__main__':
    config = get_config()
    data_folder_path = get_data_folder_path(config=config)

    input_file_path = os.path.join(data_folder_path, 'input.txt')
    tokenizer: GPT2Tokenizer = get_gpt2_tokenizer(config=config)

    # Report the tokenizer's maximum model sequence length (1024 for GPT-2).
    print(tokenizer.model_max_length)

    # Download the Tiny Shakespeare corpus on first run.
    if not Path(input_file_path).exists():
        data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
        response = requests.get(data_url, timeout=30)
        response.raise_for_status()  # fail loudly instead of writing an error page to disk
        with open(input_file_path, 'w', encoding='utf-8') as f:
            f.write(response.text)

    # Read the corpus, skipping blank lines. Each kept line is prefixed with a
    # space (GPT-2's BPE encodes space-prefixed words as its common case), and
    # joining a list avoids quadratic string concatenation.
    kept_lines = []
    with open(input_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.rstrip():
                kept_lines.append(' ' + line)
    data = ''.join(kept_lines)

    print(f"length of dataset in characters: {len(data):,}")

    # 90% train / 10% test split on raw characters, before tokenization.
    n = len(data)
    train_split = int(n * 0.9)
    train_data = data[:train_split]
    test_data = data[train_split:]

    # Encode both splits with the GPT-2 BPE tokenizer. Encoding text longer
    # than model_max_length emits a length warning, but the full id sequence
    # is still returned.
    train_ids = tokenizer.encode(train_data)
    test_ids = tokenizer.encode(test_data)
    print(f"train has {len(train_ids):,} tokens")
    print(f"test has {len(test_ids):,} tokens")

    # GPT-2's vocabulary has 50,257 tokens, so every id fits in uint16 and the
    # .bin files are half the size they would be as int32.
    train_ids = np.array(train_ids, dtype=np.uint16)
    test_ids = np.array(test_ids, dtype=np.uint16)
    train_ids.tofile(os.path.join(data_folder_path, 'train.bin'))
    test_ids.tofile(os.path.join(data_folder_path, 'test.bin'))
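
    # Sanity check: reading the file back with the matching dtype should
    # recover the ids exactly. A training loop would typically memory-map the
    # same file instead, e.g. np.memmap(path, dtype=np.uint16, mode='r').
    reloaded = np.fromfile(os.path.join(data_folder_path, 'train.bin'), dtype=np.uint16)
    assert (reloaded == train_ids).all(), 'train.bin round-trip mismatch'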

    # Show the special tokens. GPT-2 defines an end-of-text token but no pad
    # token by default, so only query the pad token if get_gpt2_tokenizer
    # assigned one (commonly tokenizer.pad_token = tokenizer.eos_token).
    print(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id))
    if tokenizer.pad_token_id is not None:
        print(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id))
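
    # Illustrative only: one way a nanoGPT-style loader might draw a batch from
    # train.bin. block_size and batch_size are assumed values for this sketch,
    # not read from shakespeare_config.
    block_size, batch_size = 256, 4
    arr = np.memmap(os.path.join(data_folder_path, 'train.bin'), dtype=np.uint16, mode='r')
    starts = np.random.randint(0, len(arr) - block_size - 1, size=batch_size)
    x = np.stack([arr[s:s + block_size].astype(np.int64) for s in starts])
    y = np.stack([arr[s + 1:s + 1 + block_size].astype(np.int64) for s in starts])
    print(x.shape, y.shape)  # inputs and next-token targets, (batch_size, block_size)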