# Page-scrape residue, not part of the program: "Spaces: Runtime error"
import gradio as gr | |
from PyPDF4 import PdfFileReader | |
import tiktoken | |
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    The file is opened in binary mode and read with ``PdfFileReader``; the
    text extracted from each page is concatenated in page order with no
    separator between pages.
    """
    with open(file_path, "rb") as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        # Extraction must happen while the underlying stream is still open.
        page_texts = [
            reader.getPage(index).extractText()
            for index in range(reader.getNumPages())
        ]
    return "".join(page_texts)
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode ``text`` with the tiktoken encoding registered for ``model``.

    ``disallowed_special=()`` is passed to ``encode``; per tiktoken's
    documentation this turns off the special-token check, so input that
    happens to contain special-token text is encoded as ordinary text
    instead of raising.

    Returns whatever ``encode`` returns (the token sequence for ``text``).
    """
    encoding = tiktoken.encoding_for_model(model)
    return encoding.encode(text, disallowed_special=())
def count_tokens(text):
    """Return the number of tokens ``tokenize`` produces for ``text``."""
    token_ids = tokenize(text)
    return len(token_ids)
def count_tokens_in_file(file):
    """Return the token count of the text extracted from an uploaded PDF.

    ``file`` must expose the PDF's filesystem path as ``file.name``.
    """
    pdf_path = file.name
    return count_tokens(extract_text_from_pdf(pdf_path))
def chunk_text(text, max_char, overlap):
    """Split ``text`` into overlapping chunks of at most ``max_char`` characters.

    Consecutive chunks start ``max_char - overlap`` characters apart, so each
    chunk begins with the last ``overlap`` characters of the previous one.
    Chunking stops with the first chunk that reaches the end of ``text``;
    no further tail chunks (which would be fully contained in that chunk)
    are produced.

    The chunk size setting and the token count of every chunk are printed
    as a diagnostic.

    Args:
        text: The string to split.
        max_char: Maximum chunk length in characters; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            non-negative and strictly smaller than ``max_char``.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.

    Raises:
        ValueError: If ``max_char`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_char``.
    """
    # Callers may pass UI slider values; coerce so slicing never sees a float.
    max_char = int(max_char)
    overlap = int(overlap)

    if max_char <= 0:
        raise ValueError(f"max_char must be positive, got {max_char}")
    if not 0 <= overlap < max_char:
        # overlap >= max_char would give a step <= 0 and the loop would never
        # advance; a negative overlap would silently skip text between chunks.
        raise ValueError(
            f"overlap must be in [0, max_char); got overlap={overlap}, "
            f"max_char={max_char}"
        )

    step = max_char - overlap
    text_len = len(text)
    chunks = []
    print(f"max char: {max_char}")
    for start in range(0, text_len, step):
        end = min(start + max_char, text_len)
        chunk = text[start:end]
        print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
        chunks.append(chunk)
        if end == text_len:
            # Everything after this point is already inside this chunk.
            break
    return chunks
def chunk_file(file, max_char, overlap):
    """Chunk the text of an uploaded PDF and return the chunks as one string.

    Args:
        file: Uploaded-file object exposing the PDF path as ``file.name``,
            or ``None`` when nothing has been uploaded.
        max_char: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The chunks joined by a visible ``[xxx...]`` separator line, or an
        empty string when ``file`` is ``None``.
    """
    if file is None:
        # The handler can be triggered before any PDF has been uploaded;
        # without this guard ``file.name`` raises AttributeError.
        return ""

    # Extract text from the PDF file
    text = extract_text_from_pdf(file.name)
    chunks = chunk_text(text, max_char, overlap)
    return '\n\n[xxxxxxxxxxxxxxxxx]\n\n'.join(chunks)
# Build the two-tab UI: one tab works on an uploaded PDF, the other on
# text typed straight into a textbox. Both offer a token count and chunking.
with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")

    # Tab 1: token count is refreshed on upload; chunking runs on button click.
    with gr.Tab("Upload PDF"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        docs_input.upload(
            fn=count_tokens_in_file,
            inputs=[docs_input],
            outputs=[tb_tokenCount],
        )

        sl_max_char_per_chunk = gr.Slider(
            1000,
            30000,
            value=2000,
            label="Number of characters",
            info="Choose a number of characters per chunk",
        )
        sl_overlap = gr.Slider(
            0,
            20000,
            value=400,
            label="Overlap",
            info="Choose overlap size",
        )
        btn_chunk = gr.Button("Chunk text")
        tb_chunked_text = gr.Textbox(label='Result')
        btn_chunk.click(
            fn=chunk_file,
            inputs=[docs_input, sl_max_char_per_chunk, sl_overlap],
            outputs=[tb_chunked_text],
        )

    # Tab 2: token count is refreshed on every change of the textbox.
    with gr.Tab("Text"):
        text_input = gr.Textbox(label='Insert your text here')
        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
        text_input.change(
            fn=count_tokens,
            inputs=[text_input],
            outputs=[text_tb_tokenCount],
        )

        text_sl_max_char_per_chunk = gr.Slider(
            1000,
            30000,
            value=2000,
            label="Number of characters",
            info="Choose a number of characters per chunk",
        )
        text_sl_overlap = gr.Slider(
            0,
            20000,
            value=400,
            label="Overlap",
            info="Choose overlap size",
        )
        text_btn_chunk = gr.Button("Chunk text")
        text_tb_chunked_text = gr.Textbox(label='Result')

        def format_chunks(text, max_char, overlap):
            """Chunk ``text`` and join the pieces with a visible separator."""
            pieces = chunk_text(text, max_char, overlap)
            return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(pieces)

        text_btn_chunk.click(
            fn=format_chunks,
            inputs=[text_input, text_sl_max_char_per_chunk, text_sl_overlap],
            outputs=[text_tb_chunked_text],
        )

#demo.queue()
demo.launch(debug=True, share=False)