import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Pages are read in order and their extracted text is joined with no
    separator between pages.
    """
    with open(file_path, "rb") as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        # Extract while the file is still open: the reader pulls page data
        # from the underlying stream.
        page_texts = [
            reader.getPage(index).extractText()
            for index in range(reader.getNumPages())
        ]
    return "".join(page_texts)

def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* with the tiktoken encoding registered for *model*.

    Returns whatever ``encode`` produces for the text (the token sequence).
    """
    encoding = tiktoken.encoding_for_model(model)
    # An empty ``disallowed_special`` switches off tiktoken's special-token
    # check, so input that happens to contain special-token text is encoded
    # as ordinary text instead of raising.
    return encoding.encode(text, disallowed_special=())

def count_tokens(text):
    """Return the number of tokens *text* encodes to with the default model."""
    token_ids = tokenize(text)
    return len(token_ids)

def count_tokens_in_file(file):
    """Return the token count of the text extracted from a PDF.

    *file* must expose a ``name`` attribute holding the path of the PDF.
    """
    return count_tokens(extract_text_from_pdf(file.name))

def chunk_text(text, max_char, overlap):
    """Split *text* into overlapping chunks of at most *max_char* characters.

    Consecutive chunks start ``max_char - overlap`` characters apart, so with
    a non-negative *overlap* each chunk begins with the last *overlap*
    characters of the previous one. The chunk size setting and the token
    count of every chunk are printed as a debugging aid.

    Args:
        text: The string to split.
        max_char: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than *max_char*.

    Returns:
        The chunks in order of appearance; an empty list when *text* is empty.

    Raises:
        ValueError: If *text* is non-empty and ``overlap >= max_char``.
    """
    print(f"max char: {max_char}")
    if not text:
        return []

    step = max_char - overlap
    if step <= 0:
        # A non-positive step means the window never moves forward; looping
        # anyway would never terminate and would grow the result forever.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_char ({max_char})"
        )

    chunks = []
    for start in range(0, len(text), step):
        # Clamp the window so the reported end index never exceeds the text.
        end = min(start + max_char, len(text))
        chunk = text[start:end]
        print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
        chunks.append(chunk)
    return chunks

def chunk_file(file, max_char, overlap):
    """Chunk the text of a PDF and return the chunks as one display string.

    *file* must expose a ``name`` attribute holding the path of the PDF.
    The chunks produced by ``chunk_text`` are joined with a visible marker
    line so the boundaries can be seen in the output.
    """
    pdf_text = extract_text_from_pdf(file.name)
    separator = '\n\n[xxxxxxxxxxxxxxxxx]\n\n'
    return separator.join(chunk_text(pdf_text, max_char, overlap))

# Build the UI: one tab works on an uploaded PDF, the other on pasted text.
# Both tabs offer the same two features: a token count and a chunked view.
with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")
    with gr.Tab("Upload PDF"): 
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Count the tokens as soon as a file has been uploaded.
        docs_input.upload(count_tokens_in_file,inputs=[docs_input],outputs=[tb_tokenCount])
        # NOTE(review): the overlap range (up to 20000) exceeds the smallest
        # chunk size (1000), so the UI allows overlap >= chunk size -- a
        # combination for which no meaningful chunking exists.
        sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
        sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")

        btn_chunk = gr.Button("Chunk text")
        tb_chunked_text = gr.Textbox(label='Result')
        
        # Chunk the uploaded PDF with the slider settings and show the result.
        btn_chunk.click(chunk_file,inputs=[docs_input,sl_max_char_per_chunk,sl_overlap],outputs=[tb_chunked_text])
    with gr.Tab("Text"): 
        text_input = gr.Textbox(label='Insert your text here')
        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Recount the tokens every time the text box content changes.
        text_input.change(count_tokens,inputs=[text_input],outputs=[text_tb_tokenCount])
        # Same slider ranges as on the PDF tab (the same overlap >= chunk size
        # caveat applies).
        text_sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
        text_sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")

        text_btn_chunk = gr.Button("Chunk text")
        text_tb_chunked_text = gr.Textbox(label='Result')
        def format_chunks(text,max_char,overlap):
            """Chunk *text* and join the chunks with a visible marker line."""
            return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(chunk_text(text,max_char,overlap))
        text_btn_chunk.click(format_chunks,
                             inputs=[text_input,text_sl_max_char_per_chunk,text_sl_overlap],
                             outputs=[text_tb_chunked_text])

# Request queuing is currently disabled.
#demo.queue()
# Start the app at import time, in debug mode and without a public share link.
demo.launch(debug=True,share=False)