import logging
from pathlib import Path

import gradio as gr
from datasets import Dataset
from gradio_log import Log
from huggingface_hub import DatasetCard
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from tqdm.auto import tqdm

# Write app logs to a file so the gradio_log component below can tail them.
log_file = "logs.txt"
Path(log_file).touch(exist_ok=True)
logging.basicConfig(filename=log_file, level=logging.INFO)


def load_corpus(
    files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
):
    """Load the uploaded files and return a mapping of IDs to text (chunks)."""
    if verbose:
        gr.Info("Loading files...")
    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if not split_sentences:
        gr.Info(
            "Skipping sentence splitting. Each file will be a single row in the dataset."
        )
        return {doc.id_: doc.text for doc in docs}
    return split_corpus(verbose, docs, chunk_size, chunk_overlap)


def split_corpus(verbose, docs, chunk_size, chunk_overlap):
    """Split documents into sentence chunks with Llama Index's SentenceSplitter."""
    if verbose:
        gr.Info(f"Loaded {len(docs)} docs")
    parser = SentenceSplitter.from_defaults(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        gr.Info(f"Parsed {len(nodes)} nodes")
    docs = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in tqdm(nodes)
    }
    # Remove empty chunks.
    return {k: v for k, v in docs.items() if v}


def upload_and_preview(
    files,
    chunk_size: int = 256,
    chunk_overlap: int = 0,
    split_sentences: bool = True,
):
    """Parse the uploaded files and build a preview dataset with the current settings."""
    print("loading files")
    file_paths = [file.name for file in files]
    print("parsing into sentences")
    corpus = load_corpus(
        file_paths,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        split_sentences=split_sentences,
    )
    gr.Info("Creating dataset")
    dataset = Dataset.from_dict(
        {"ids": list(corpus.keys()), "texts": list(corpus.values())}
    )
    message = f"Files uploaded and dataset preview created:\n - {len(dataset)} rows"
    state = {
        "file_paths": file_paths,
        "dataset": dataset,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return state, dataset.to_pandas(), message


def preview_dataset(
    state,
    chunk_size: int = 256,
    chunk_overlap: int = 0,
    split_sentences: bool = True,
):
    """Re-chunk the previously uploaded files with the updated settings."""
    if not state.get("file_paths"):
        raise gr.Error("Please upload files first.")
    print("parsing into sentences")
    corpus = load_corpus(
        state["file_paths"],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        split_sentences=split_sentences,
    )
    print("Creating dataset")
    dataset = Dataset.from_dict(
        {"ids": list(corpus.keys()), "texts": list(corpus.values())}
    )
    message = f"Dataset preview updated:\n - {len(dataset)} rows"
    state["dataset"] = dataset
    state["chunk_size"] = chunk_size
    state["chunk_overlap"] = chunk_overlap
    return state, dataset.to_pandas(), message


def upload_to_hub(
    state,
    hub_id: str = None,
    private: bool = False,
    oauth_token: gr.OAuthToken = None,
):
    """Push the previewed dataset to the Hugging Face Hub."""
    if not state.get("dataset"):
        raise gr.Error("Please preview the dataset first.")
    dataset = state["dataset"]
    chunk_size = state["chunk_size"]
    chunk_overlap = state["chunk_overlap"]
    message = f"Dataset has:\n - {len(dataset)} rows"
    if hub_id:
        if oauth_token is not None:
            gr.Info("Uploading dataset to the Hugging Face Hub...")
            dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
            update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
            message += (
                f"\n\nUploaded to [{hub_id}](https://huggingface.co/datasets/{hub_id})"
            )
        else:
            raise gr.Error("Please log in to the Hugging Face Hub to push the dataset.")
    return message
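# Illustrative sketch only, not executed by the app: for a single local file the
# helpers above boil down to roughly the following (file name and repo id are
# hypothetical; the defaults of chunk_size=256 and chunk_overlap=0 are assumed):
#
#   corpus = load_corpus(["example.txt"])  # {node_id: chunk_text, ...}
#   dataset = Dataset.from_dict(
#       {"ids": list(corpus.keys()), "texts": list(corpus.values())}
#   )
#   dataset.push_to_hub("username/my-corpus", private=False)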
def update_dataset_card(
    hub_id,
    token,
    chunk_size,
    chunk_overlap,
):
    """Add a short description and a `corpus-creator` tag to the pushed dataset's card."""
    card = DatasetCard.load(hub_id, token=token)
    if not card.text:
        # Add a template description to an otherwise empty card.
        card.text += f"""This dataset was created with [Corpus Creator](https://huggingface.co/spaces/davanstrien/corpus-creator) by parsing a corpus of text files into chunks of sentences using Llama Index, with a chunk size of {chunk_size} and a chunk overlap of {chunk_overlap}."""
    tags = card.data.get("tags", [])
    tags.append("corpus-creator")
    card.data["tags"] = tags
    card.push_to_hub(hub_id, token=token)


description = """
Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks.
In particular, the app focuses on splitting texts into chunks of a specified size and overlap.
This can be useful for preparing data for synthetic data generation, pipelines, or annotation tasks.
See an [example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with this tool from a collection of plain text files.

The resulting text chunks are stored in a dataset that can be previewed and uploaded to the Hugging Face Hub for easy sharing and access by the community.
The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) class.

### Usage:

1. Upload Files: Use the upload button to load file(s) for processing. A preview will be generated automatically using the default settings.
2. Adjust Parameters (Optional): Customize the chunk size, overlap, and sentence-splitting option according to your requirements.
3. Update Preview (Optional): Click the 'Update Preview' button to view the updated dataset based on your parameter changes.
4. Login: When ready to upload, log in to your Hugging Face account using the provided login button.
5. Upload to Hub: Specify the Hub ID, choose whether to make the dataset private, and click 'Upload to Hub'."""


with gr.Blocks() as demo:
    state = gr.State({})
    gr.HTML(
        """<h1 style="text-align: center;">Corpus Creator</h1>
        <p style="text-align: center;">📁 From random files to a Hugging Face dataset in a few steps 📁</p>"""
    )
    gr.Markdown(description)
    with gr.Row():
        upload_button = gr.File(
            file_types=["text"],
            file_count="multiple",
            height=50,
            interactive=True,
            label="Upload Files",
        )
    with gr.Row():
        split_sentences = gr.Checkbox(True, label="Split sentences?")
        chunk_size = gr.Number(
            256,
            label="Chunk size (size to split text into)",
            minimum=10,
            maximum=4096,
            step=1,
        )
        chunk_overlap = gr.Number(
            0,
            label="Chunk overlap (overlap size between chunks)",
            minimum=0,
            maximum=4096,
            step=1,
        )
    update_preview_button = gr.Button("Update Preview")
    corpus_preview_df = gr.DataFrame(label="Dataset Preview")
    preview_summary = gr.Markdown()
    with gr.Row():
        gr.LoginButton()
        with gr.Column():
            gr.Markdown(
                "To upload to the Hub, add the ID of the repo you want to push the dataset to"
            )
            hub_id = gr.Textbox(value=None, label="Hub ID")
            private = gr.Checkbox(False, label="Upload dataset to a private repo?")
            upload_hub_button = gr.Button("Upload to Hub")
    upload_summary = gr.Markdown()
    with gr.Accordion("detailed logs", open=False):
        Log(log_file, dark=True, xterm_font_size=12)
    upload_button.upload(
        upload_and_preview,
        inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
        outputs=[state, corpus_preview_df, preview_summary],
    )
    update_preview_button.click(
        preview_dataset,
        inputs=[state, chunk_size, chunk_overlap, split_sentences],
        outputs=[state, corpus_preview_df, preview_summary],
    )
    upload_hub_button.click(
        upload_to_hub,
        inputs=[state, hub_id, private],
        outputs=[upload_summary],
    )

demo.launch(debug=True)