import os
import zipfile
import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken

def extract_text_from_pdf(file_path):
    """Read the PDF at *file_path* and return the concatenated text of all pages."""
    with open(file_path, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        # Join page texts in order; the generator is consumed before the file closes.
        pages = (reader.getPage(n).extractText() for n in range(reader.getNumPages()))
        return "".join(pages)

def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* into token ids using the tokenizer associated with *model*.

    `disallowed_special=()` lets special-token-like substrings in the input
    pass through as ordinary text instead of raising.
    """
    encoder = tiktoken.encoding_for_model(model)
    return encoder.encode(text, disallowed_special=())

def count_tokens(text):
    """Return how many tokens the default model's tokenizer produces for *text*."""
    token_ids = tokenize(text)
    return len(token_ids)

def analyse_text(text):
    """Return a short multi-line report on *text*: length, token count, chars/token.

    Returns the literal string 'no text' when tokenization yields zero tokens
    (e.g. empty input), since chars-per-token would divide by zero.

    Bug fixed: the original caught every exception bare, then did
    ``'\n'.join('no text')`` — joining the *string* character by character,
    producing ``n\no\n \nt\ne\nx\nt`` instead of ``no text``.
    """
    num_tokens = count_tokens(text)
    if num_tokens == 0:
        return 'no text'
    report = [
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {len(text) / num_tokens:.1f}",
    ]
    return '\n'.join(report)

def analyse_file(file):
    """Extract and return the full text of an uploaded PDF (gradio File object)."""
    return extract_text_from_pdf(file.name)

def write_chunks_to_files(chunks):
    """Write each chunk to its own numbered text file in the working directory.

    Args:
        chunks: iterable of strings.

    Returns:
        List of the file paths written: ["chunk_1.txt", "chunk_2.txt", ...].
    """
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        # Explicit UTF-8: PDF text routinely contains non-ASCII characters,
        # and the platform default encoding (e.g. cp1252 on Windows) would
        # raise UnicodeEncodeError on them.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths

def write_chunks_to_zip(chunks):
    """Bundle every chunk into 'chunks.zip' and return the archive's file name.

    The intermediate chunk_*.txt files are deleted as soon as each one has
    been added to the archive.
    """
    zip_file_name = "chunks.zip"
    with zipfile.ZipFile(zip_file_name, 'w') as archive:
        for path in write_chunks_to_files(chunks):
            archive.write(path)
            os.remove(path)  # temp file no longer needed once archived
    return zip_file_name

def chunk_text(text, max_char, overlap):
    """Split *text* into overlapping windows of at most *max_char* characters.

    Args:
        text: source string.
        max_char: maximum characters per chunk; must be greater than overlap.
        overlap: number of characters shared between consecutive chunks.

    Returns:
        List of (chunk, char_count, token_count) tuples.

    Raises:
        ValueError: if overlap >= max_char. The original loop advanced by
            ``max_char - overlap`` with no guard, so a step <= 0 (which the
            UI sliders permit) spun forever without progressing.
    """
    step = max_char - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_char")
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + max_char]  # slice clamps at end of text
        chunks.append((chunk, len(chunk), count_tokens(chunk)))
    return chunks

def chunk_file(file, max_char, overlap):
    """Extract a PDF's text, then chunk it and zip the chunks.

    Args:
        file: gradio File object (``.name`` is the path on disk).
        max_char: maximum characters per chunk.
        overlap: characters shared between consecutive chunks.

    Returns:
        Tuple of (newline-joined chunk summary, path to the zip archive).

    Delegates to chunk_and_zip_text so the summary-formatting and zipping
    logic lives in one place — the two functions previously duplicated it
    line for line.
    """
    text = extract_text_from_pdf(file.name)
    return chunk_and_zip_text(text, max_char, overlap)

def chunk_and_zip_text(text, max_char, overlap):
    """Chunk *text*, zip the chunks, and return (summary, zip path).

    The summary has one line per chunk giving its size in characters and
    tokens; the zip archive contains one chunk_N.txt file per chunk.
    """
    pieces = chunk_text(text, max_char, overlap)
    summary_lines = []
    for idx, (body, _size, n_tokens) in enumerate(pieces, start=1):
        summary_lines.append(f"Chunk[{idx}]: Size: {len(body)} chars, {n_tokens} tokens")
    archive_path = write_chunks_to_zip([body for body, _size, _tokens in pieces])
    return '\n'.join(summary_lines), archive_path

# Gradio UI: upload a PDF (or paste text), see a token analysis, then chunk
# the text and download the chunks as a zip archive.
with gr.Blocks() as demo:
    # Single-PDF upload; its extracted text is piped into the textbox below.
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk',show_copy_button=True)
    # Read-only summary: char length, token count, chars/token.
    tb_analysis = gr.Textbox(label='Text Analysis')
    sl_max_char_per_chunk = gr.Slider(1000, 300000, value=10000, label="Number of characters", info="Choose a number of characters per chunk")
    # NOTE(review): the slider allows overlap >= max_char, which chunk_text's
    # step arithmetic cannot make progress on — confirm intended ranges.
    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
    btn_chunk = gr.Button("Chunk text")
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')

    # Uploading a PDF extracts its text into text_to_chunk; any change to
    # that textbox (upload or manual edit) refreshes the analysis box.
    docs_input.upload(analyse_file,inputs=[docs_input], outputs=[text_to_chunk])
    text_to_chunk.change(analyse_text,inputs=[text_to_chunk],outputs=[tb_analysis])

    # Chunk button: produces the per-chunk summary and the downloadable zip.
    btn_chunk.click(chunk_and_zip_text, inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text, download_link])

demo.launch(debug=True, share=False)