# Hindi-tokenizer / data_analysis.py
# Read text from a file
with open('text_file.txt', 'r', encoding='utf-8') as file:
    text = file.read()

tokens = text.encode("utf-8")    # raw bytes
tokens = list(map(int, tokens))  # convert to a list of integers in range 0..255 for convenience
print('---')
print("length of text:", len(text))
print('---')
#print(tokens)
print('---')
print("length of tokens:", len(tokens))
def get_stats(ids):
    """Count how often each consecutive pair of token ids occurs in `ids`."""
    counts = {}
    for pair in zip(ids, ids[1:]):  # Pythonic way to iterate consecutive elements
        counts[pair] = counts.get(pair, 0) + 1
    return counts
stats = get_stats(tokens)
print('---')
# print(stats)
#print(sorted(((v,k) for k,v in stats.items()), reverse=True))
print('---')
top_pair = max(stats, key=stats.get)
print("most frequent pair:", top_pair)
# For Hindi text this is typically (224, 164): 0xE0 0xA4 are the two leading
# UTF-8 bytes shared by Devanagari characters in the range U+0900..U+093F.
#print(chr(224), chr(164))
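
# Sketch of the likely next step (an assumption about where this analysis leads,
# in the style of a byte-pair-encoding tokenizer): replace every occurrence of the
# most frequent pair with a new token id. The function name `merge` and the new
# id 256 are illustrative choices, not part of this file.
def merge(ids, pair, idx):
    """Return a new list where each consecutive occurrence of `pair` becomes `idx`."""
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

merged = merge(tokens, top_pair, 256)
print("length of tokens after one merge:", len(merged))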