"""Gradio app that reports how many GPT-2 tokens an uploaded PDF contains."""

import gradio as gr
import tiktoken
from PyPDF2 import PdfReader

# Name of the tiktoken encoding used for counting (GPT-2 byte-pair encoding).
ENCODING_NAME = "gpt2"


def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *file_path*.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages. Pages that yield no text contribute an empty string.
    """
    with open(file_path, "rb") as file:
        pdf = PdfReader(file)
        # ``PdfFileReader``/``getPage``/``extractText`` were removed in
        # PyPDF2 3.0; ``PdfReader.pages`` and ``extract_text`` replace them.
        # Extraction must happen while the file handle is still open.
        return "".join(page.extract_text() or "" for page in pdf.pages)


def count_tokens(text: str) -> int:
    """Return the number of GPT-2 tokens in *text*.

    Args:
        text: Arbitrary text to tokenize.

    Returns:
        The length of the token sequence produced by the GPT-2 encoding.
    """
    encoding = tiktoken.get_encoding(ENCODING_NAME)
    # ``encode_ordinary`` treats strings such as "<|endoftext|>" as plain
    # text; plain ``encode`` would raise ValueError if a document happened
    # to contain one of them.
    return len(encoding.encode_ordinary(text))


def count_tokens_in_file(file) -> int:
    """Count the GPT-2 tokens in an uploaded PDF.

    Args:
        file: The value Gradio passes for the ``gr.File`` input. Depending on
            the Gradio version this is either a path string or an object
            whose ``name`` attribute holds the path; ``None`` when nothing
            has been uploaded.

    Returns:
        The token count of the PDF's text, or 0 when no file was uploaded.
    """
    if file is None:
        # The button was clicked before any upload; nothing to count.
        return 0

    file_path = getattr(file, "name", file)
    # Extract text from the PDF file
    paper_text = extract_text_from_pdf(file_path)
    return count_tokens(paper_text)


with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")
    # NOTE(review): the tab label mentions TXT, but only ".pdf" uploads are
    # accepted and only PDF extraction is implemented.
    with gr.Tab("Upload PDF & TXT"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        btn_count = gr.Button("Count token")
        btn_count.click(
            count_tokens_in_file,
            inputs=[docs_input],
            outputs=[tb_tokenCount],
        )


if __name__ == "__main__":
    # Start the web server only when run as a script, so importing this
    # module (e.g. to reuse the helpers) has no side effects.
    demo.launch()