# TokenCounter / app.py
# Scraped from the Hugging Face Space page: author Almaatla, commit 6cfb094 ("Update app.py"), 3.18 kB.
import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string made of the text extracted from each page, in page
        order, with no separator inserted between pages.
    """
    # Extraction stays inside the ``with`` block so the reader always works
    # on an open file handle.
    with open(file_path, "rb") as file:
        pdf = PdfFileReader(file)
        # Join once instead of growing a string with ``+=`` for every page,
        # which can degrade to quadratic time on long documents.
        return "".join(
            pdf.getPage(page_num).extractText()
            for page_num in range(pdf.getNumPages())
        )
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* with the tiktoken encoding associated with *model*.

    Args:
        text: The string to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model`` to
            select the encoding. Defaults to ``"gpt-3.5-turbo"``.

    Returns:
        The result of the encoding's ``encode`` call for *text*.
    """
    encoding = tiktoken.encoding_for_model(model)
    # Per the tiktoken documentation, an empty ``disallowed_special`` turns
    # off the special-token check, so text that merely looks like a special
    # token is encoded as ordinary text instead of raising.
    return encoding.encode(text, disallowed_special=())
def count_tokens(text):
    """Return the number of tokens that ``tokenize`` produces for *text*."""
    token_ids = tokenize(text)
    return len(token_ids)
def count_tokens_in_file(file):
    """Count the tokens in the text of an uploaded PDF.

    Args:
        file: The uploaded file as delivered by the file-upload widget:
            an object exposing its path as ``.name``, a plain path string,
            or ``None`` when nothing has been uploaded.

    Returns:
        The token count of the text extracted from the PDF, or ``0`` when
        *file* is ``None``.
    """
    # Without this guard a missing upload fails with
    # ``AttributeError: 'NoneType' object has no attribute 'name'``.
    if file is None:
        return 0
    # Accept both temp-file style objects (path in ``.name``) and plain
    # path strings, which have no ``.name`` attribute.
    pdf_path = getattr(file, "name", file)
    paper_text = extract_text_from_pdf(pdf_path)
    return count_tokens(paper_text)
def chunk_text(text, max_char, overlap):
    """Split *text* into fixed-size character chunks that overlap.

    Each chunk holds at most ``max_char`` characters and starts
    ``max_char - overlap`` characters after the previous one, so consecutive
    chunks share ``overlap`` characters. The token count of every chunk is
    printed as a debug trace.

    Args:
        text: The string to split.
        max_char: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < max_char``.

    Returns:
        The chunks in order. The list is empty when *text* is empty, and the
        last chunk always ends at the end of *text*.

    Raises:
        ValueError: If ``max_char`` is not positive or ``overlap`` is
            outside ``[0, max_char)``.
    """
    # Callers such as UI sliders may pass floats; slicing needs integers.
    max_char = int(max_char)
    overlap = int(overlap)
    if max_char <= 0:
        raise ValueError(f"max_char must be positive, got {max_char}")
    if not 0 <= overlap < max_char:
        # With overlap >= max_char the window would never move forward and
        # the loop would run forever; a negative overlap would skip text.
        raise ValueError(
            f"overlap must be in [0, {max_char}), got {overlap}"
        )

    print(f"max char: {max_char}")
    step = max_char - overlap
    text_len = len(text)
    chunks = []
    for start in range(0, text_len, step):
        end = min(start + max_char, text_len)
        chunk = text[start:end]
        print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
        chunks.append(chunk)
        if end == text_len:
            # The window reached the end of the text: any further chunk
            # would only repeat characters already in this one.
            break
    return chunks
def chunk_file(file, max_char, overlap):
    """Extract the text of an uploaded PDF and return it split into chunks.

    Args:
        file: The uploaded file as delivered by the file-upload widget:
            an object exposing its path as ``.name``, a plain path string,
            or ``None`` when nothing has been uploaded.
        max_char: Maximum number of characters per chunk, forwarded to
            ``chunk_text``.
        overlap: Number of characters shared by consecutive chunks,
            forwarded to ``chunk_text``.

    Returns:
        All chunks joined into one string with a visible
        ``[xxxxxxxxxxxxxxxxx]`` marker line between them, or an empty string
        when *file* is ``None``.
    """
    # The button can be clicked before any document is uploaded; without
    # this guard that fails with an AttributeError on ``None.name``.
    if file is None:
        return ""
    # Accept both temp-file style objects (path in ``.name``) and plain
    # path strings, which have no ``.name`` attribute.
    pdf_path = getattr(file, "name", file)
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text, max_char, overlap)
    return '\n\n[xxxxxxxxxxxxxxxxx]\n\n'.join(chunks)
# Two-tab UI: one tab works on an uploaded PDF, the other on pasted text.
# Each tab shows a token count and offers overlapping character chunking.
with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")

    with gr.Tab("Upload PDF"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Count tokens as soon as a file has been uploaded.
        docs_input.upload(
            count_tokens_in_file,
            inputs=[docs_input],
            outputs=[tb_tokenCount],
        )

        sl_max_char_per_chunk = gr.Slider(
            1000,
            30000,
            value=2000,
            label="Number of characters",
            info="Choose a number of characters per chunk",
        )
        sl_overlap = gr.Slider(
            0,
            20000,
            value=400,
            label="Overlap",
            info="Choose overlap size",
        )
        btn_chunk = gr.Button("Chunk text")
        tb_chunked_text = gr.Textbox(label='Result')
        # Chunk the uploaded document with the current slider settings.
        btn_chunk.click(
            chunk_file,
            inputs=[docs_input, sl_max_char_per_chunk, sl_overlap],
            outputs=[tb_chunked_text],
        )

    with gr.Tab("Text"):
        text_input = gr.Textbox(label='Insert your text here')
        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Recount on every change of the text box.
        text_input.change(
            count_tokens,
            inputs=[text_input],
            outputs=[text_tb_tokenCount],
        )

        text_sl_max_char_per_chunk = gr.Slider(
            1000,
            30000,
            value=2000,
            label="Number of characters",
            info="Choose a number of characters per chunk",
        )
        text_sl_overlap = gr.Slider(
            0,
            20000,
            value=400,
            label="Overlap",
            info="Choose overlap size",
        )
        text_btn_chunk = gr.Button("Chunk text")
        text_tb_chunked_text = gr.Textbox(label='Result')

        def format_chunks(text, max_char, overlap):
            """Chunk *text* and join the chunks with a visible marker line."""
            pieces = chunk_text(text, max_char, overlap)
            return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(pieces)

        text_btn_chunk.click(
            format_chunks,
            inputs=[text_input, text_sl_max_char_per_chunk, text_sl_overlap],
            outputs=[text_tb_chunked_text],
        )

# Queuing is left disabled, as in the original:
# demo.queue()
demo.launch(debug=True, share=False)