Spaces:

OrganizedProgrammers
/

TokenCounter

Runtime error

App Files Files Community

TokenCounter / app.py

Almaatla

Update app.py

6cfb094 over 1 year ago

raw

history blame

3.18 kB

	import gradio as gr
	from PyPDF4 import PdfFileReader
	import tiktoken

	def extract_text_from_pdf(file_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Path of the PDF file; it is opened in binary mode.

	    Returns:
	        A single string made of each page's ``extractText()`` result,
	        joined with no separator between pages.
	    """
	    with open(file_path, "rb") as file:
	        pdf = PdfFileReader(file)
	        # The reader was built from this handle, so every page is extracted
	        # before the ``with`` block closes it.
	        # ``"".join`` assembles the result in one pass instead of growing a
	        # string with ``+=`` once per page.
	        return "".join(
	            pdf.getPage(page_num).extractText()
	            for page_num in range(pdf.getNumPages())
	        )

	def tokenize(text, model="gpt-3.5-turbo"):
	    """Encode ``text`` with the tiktoken encoding selected for ``model``.

	    Args:
	        text: String to encode.
	        model: Model name handed to ``tiktoken.encoding_for_model``.

	    Returns:
	        The token sequence returned by the encoding's ``encode`` call.
	    """
	    encoding = tiktoken.encoding_for_model(model)
	    # An empty ``disallowed_special`` is forwarded; per tiktoken's docs this
	    # turns off the special-token check so such text is encoded like any
	    # other text instead of raising.
	    return encoding.encode(text, disallowed_special=())

	def count_tokens(text):
	    """Return how many tokens ``tokenize`` produces for ``text``."""
	    token_ids = tokenize(text)
	    return len(token_ids)

	def count_tokens_in_file(file):
	    """Return the token count of the PDF referenced by ``file``.

	    Args:
	        file: Object whose ``name`` attribute is the path of the PDF to
	            read.

	    Returns:
	        Number of tokens in the text extracted from that PDF.
	    """
	    # The PDF is first flattened to plain text, then tokenized and counted.
	    return count_tokens(extract_text_from_pdf(file.name))

	def chunk_text(text, max_char, overlap):
	    """Split ``text`` into fixed-size character windows that overlap.

	    Each window holds up to ``max_char`` characters and starts
	    ``max_char - overlap`` characters after the previous one, so
	    consecutive chunks share ``overlap`` characters. The bounds and token
	    count of every chunk are printed as a progress log.

	    Args:
	        text: String to split.
	        max_char: Maximum number of characters per chunk.
	        overlap: Number of characters shared by consecutive chunks; must be
	            smaller than ``max_char``.

	    Returns:
	        List of chunk strings in order; empty when ``text`` is empty.

	    Raises:
	        ValueError: If ``overlap`` is not smaller than ``max_char``.
	    """
	    # Slice bounds must be integers; coerce in case the caller hands over
	    # floats (numeric UI widgets may do so -- TODO confirm for the Gradio
	    # version in use).
	    max_char = int(max_char)
	    overlap = int(overlap)
	    step = max_char - overlap
	    if step <= 0:
	        # A window that does not advance would never reach the end of the
	        # text: the loop would spin forever while ``chunks`` keeps growing.
	        raise ValueError(
	            f"overlap ({overlap}) must be smaller than max_char ({max_char})"
	        )
	    print(f"max char: {max_char}")
	    text_len = len(text)
	    chunks = []
	    for start in range(0, text_len, step):
	        # Clamp the final window to the end of the text.
	        end = min(start + max_char, text_len)
	        chunk = text[start:end]
	        print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
	        chunks.append(chunk)
	    return chunks

	def chunk_file(file, max_char, overlap):
	    """Chunk the text of a PDF and return the chunks as one string.

	    Args:
	        file: Object whose ``name`` attribute is the path of the PDF.
	        max_char: Chunk size, forwarded to ``chunk_text``.
	        overlap: Overlap size, forwarded to ``chunk_text``.

	    Returns:
	        All chunks joined by a bracketed marker line surrounded by blank
	        lines.
	    """
	    separator = '\n\n[xxxxxxxxxxxxxxxxx]\n\n'
	    pdf_text = extract_text_from_pdf(file.name)
	    return separator.join(chunk_text(pdf_text, max_char, overlap))

	# Two-tab UI: one tab works on an uploaded PDF, the other on pasted text.
	# Each tab shows a token count and offers character-based chunking.
	with gr.Blocks() as demo:
	    gr.Markdown("Upload your document to count their tokens")

	    # ---- Tab 1: PDF upload -------------------------------------------
	    with gr.Tab("Upload PDF"):
	        docs_input = gr.File(file_types=[".pdf"], file_count="single")
	        tb_tokenCount = gr.Textbox(label='Number of tokens')
	        # Count tokens on every upload event of the file widget.
	        docs_input.upload(
	            count_tokens_in_file,
	            inputs=[docs_input],
	            outputs=[tb_tokenCount],
	        )
	        sl_max_char_per_chunk = gr.Slider(
	            minimum=1000,
	            maximum=30000,
	            value=2000,
	            label="Number of characters",
	            info="Choose a number of characters per chunk",
	        )
	        sl_overlap = gr.Slider(
	            minimum=0,
	            maximum=20000,
	            value=400,
	            label="Overlap",
	            info="Choose overlap size",
	        )

	        btn_chunk = gr.Button("Chunk text")
	        tb_chunked_text = gr.Textbox(label='Result')

	        # The button feeds the uploaded file and both slider values to
	        # ``chunk_file`` and shows the joined chunks in the result box.
	        btn_chunk.click(
	            chunk_file,
	            inputs=[docs_input, sl_max_char_per_chunk, sl_overlap],
	            outputs=[tb_chunked_text],
	        )

	    # ---- Tab 2: free text --------------------------------------------
	    with gr.Tab("Text"):
	        text_input = gr.Textbox(label='Insert your text here')
	        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
	        # Re-count tokens on every change event of the text box.
	        text_input.change(
	            count_tokens,
	            inputs=[text_input],
	            outputs=[text_tb_tokenCount],
	        )
	        text_sl_max_char_per_chunk = gr.Slider(
	            minimum=1000,
	            maximum=30000,
	            value=2000,
	            label="Number of characters",
	            info="Choose a number of characters per chunk",
	        )
	        text_sl_overlap = gr.Slider(
	            minimum=0,
	            maximum=20000,
	            value=400,
	            label="Overlap",
	            info="Choose overlap size",
	        )

	        text_btn_chunk = gr.Button("Chunk text")
	        text_tb_chunked_text = gr.Textbox(label='Result')

	        def format_chunks(text, max_char, overlap):
	            """Chunk ``text`` and join the chunks with a marker line."""
	            pieces = chunk_text(text, max_char, overlap)
	            return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(pieces)

	        text_btn_chunk.click(
	            format_chunks,
	            inputs=[text_input, text_sl_max_char_per_chunk, text_sl_overlap],
	            outputs=[text_tb_chunked_text],
	        )

	# Queueing stays disabled, as in the original script.
	# demo.queue()
	demo.launch(share=False, debug=True)