"""Gradio app: count tokens in a PDF or pasted text and split it into overlapping chunks."""

import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken


def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated."""
    with open(file_path, "rb") as file:
        pdf = PdfFileReader(file)
        # Keep the extraction inside the ``with`` block: the reader is bound
        # to the open file handle.
        return "".join(
            pdf.getPage(page_num).extractText()
            for page_num in range(pdf.getNumPages())
        )


def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* into a list of token ids with the tiktoken encoding for *model*."""
    tokenizer = tiktoken.encoding_for_model(model)
    # disallowed_special=() disables the special-token check, so text that
    # happens to contain a special-token string is encoded as ordinary text
    # instead of raising.
    return tokenizer.encode(text, disallowed_special=())


def count_tokens(text):
    """Return the number of tokens in *text* (default model of ``tokenize``)."""
    return len(tokenize(text))


def count_tokens_in_file(file):
    """Return the token count of the uploaded PDF; *file* must expose a ``.name`` path."""
    paper_text = extract_text_from_pdf(file.name)
    return count_tokens(paper_text)


def chunk_text(text, max_char, overlap):
    """Split *text* into chunks of at most *max_char* characters.

    Each chunk starts ``max_char - overlap`` characters after the previous
    one, so consecutive chunks share *overlap* characters. Chunking stops as
    soon as a chunk reaches the end of *text*; no extra chunk made only of
    already-covered overlap is emitted. An empty *text* yields ``[]``.

    The chunk boundaries and each chunk's token count are printed to stdout.

    Raises:
        ValueError: if *max_char* is not positive, or *overlap* is negative
            or not smaller than *max_char* (the window would never advance).
    """
    # The UI passes slider values; coerce defensively in case they arrive as
    # floats (assumption about the Gradio version in use -- harmless for ints).
    max_char = int(max_char)
    overlap = int(overlap)
    if max_char <= 0:
        raise ValueError(f"max_char ({max_char}) must be a positive number of characters")
    if not 0 <= overlap < max_char:
        raise ValueError(
            f"overlap ({overlap}) must be at least 0 and smaller than max_char ({max_char})"
        )

    step = max_char - overlap  # guaranteed >= 1 by the checks above
    text_len = len(text)
    chunks = []
    print(f"max char: {max_char}")
    for start in range(0, text_len, step):
        end = min(start + max_char, text_len)
        chunk = text[start:end]
        print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
        chunks.append(chunk)
        if end == text_len:
            # The whole text is covered; a further window would only repeat
            # the tail of this chunk.
            break
    return chunks


def _chunk_text_for_ui(text, max_char, overlap):
    """Run ``chunk_text`` and surface invalid slider combinations as a ``gr.Error``."""
    try:
        return chunk_text(text, max_char, overlap)
    except ValueError as err:
        raise gr.Error(str(err)) from err


def chunk_file(file, max_char, overlap):
    """Chunk the text of the uploaded PDF and return the chunks joined by a marker line."""
    if file is None:
        # The button can be clicked before anything has been uploaded.
        raise gr.Error("Please upload a PDF file before chunking.")
    text = extract_text_from_pdf(file.name)
    chunks = _chunk_text_for_ui(text, max_char, overlap)
    return '\n\n[xxxxxxxxxxxxxxxxx]\n\n'.join(chunks)


def format_chunks(text, max_char, overlap):
    """Chunk *text* and return the chunks joined by a marker line."""
    return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(_chunk_text_for_ui(text, max_char, overlap))


with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")

    with gr.Tab("Upload PDF"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Count tokens as soon as a file has been uploaded.
        docs_input.upload(count_tokens_in_file, inputs=[docs_input], outputs=[tb_tokenCount])
        sl_max_char_per_chunk = gr.Slider(
            1000, 30000, value=2000,
            label="Number of characters",
            info="Choose a number of characters per chunk",
        )
        sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
        btn_chunk = gr.Button("Chunk text")
        tb_chunked_text = gr.Textbox(label='Result')
        btn_chunk.click(
            chunk_file,
            inputs=[docs_input, sl_max_char_per_chunk, sl_overlap],
            outputs=[tb_chunked_text],
        )

    with gr.Tab("Text"):
        text_input = gr.Textbox(label='Insert your text here')
        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Recount on every edit of the text box.
        text_input.change(count_tokens, inputs=[text_input], outputs=[text_tb_tokenCount])
        text_sl_max_char_per_chunk = gr.Slider(
            1000, 30000, value=2000,
            label="Number of characters",
            info="Choose a number of characters per chunk",
        )
        text_sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
        text_btn_chunk = gr.Button("Chunk text")
        text_tb_chunked_text = gr.Textbox(label='Result')
        text_btn_chunk.click(
            format_chunks,
            inputs=[text_input, text_sl_max_char_per_chunk, text_sl_overlap],
            outputs=[text_tb_chunked_text],
        )

#demo.queue()
demo.launch(debug=True, share=False)