"""Gradio app that reports how many tokens an uploaded PDF contains."""

import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken


def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text extracted from all pages, joined in page order with no
        separator between pages.
    """
    with open(file_path, "rb") as file:
        pdf = PdfFileReader(file)
        # Pages must be read while the file handle is still open.
        # "".join avoids repeated string re-allocation on large documents.
        return "".join(
            pdf.getPage(page_num).extractText()
            for page_num in range(pdf.getNumPages())
        )


def count_tokens(text):
    """Return the number of tokens in *text* using the gpt-3.5-turbo encoding.

    Args:
        text: The string to tokenize.

    Returns:
        The token count as an int.
    """
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    # disallowed_special=() lets text that happens to contain special-token
    # markers be encoded as ordinary text instead of raising an error.
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)


def count_tokens_in_file(file):
    """Count the tokens in an uploaded PDF; used as the Gradio callback.

    Args:
        file: The value delivered by the ``gr.File`` component. This is
            either an object exposing the upload's path as ``.name``, a
            plain path string, or ``None`` when nothing has been uploaded.

    Returns:
        The number of tokens in the PDF's text, or 0 when no file is present.
    """
    # The "Count token" button can be clicked before any upload happened.
    if file is None:
        return 0

    # Accept both a tempfile-like wrapper (path in .name) and a bare path.
    file_path = file if isinstance(file, str) else file.name

    paper_text = extract_text_from_pdf(file_path)
    return count_tokens(paper_text)


with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")
    # NOTE(review): the tab label mentions TXT, but the uploader below only
    # accepts .pdf and the callback only parses PDFs -- confirm the intent.
    with gr.Tab("Upload PDF & TXT"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Count automatically as soon as an upload completes...
        docs_input.upload(
            count_tokens_in_file,
            inputs=[docs_input],
            outputs=[tb_tokenCount],
        )
        # ...and again on demand via the button.
        btn_count = gr.Button("Count token")
        btn_count.click(
            count_tokens_in_file,
            inputs=[docs_input],
            outputs=[tb_tokenCount],
        )

#demo.queue()
demo.launch(debug=True, share=False)