# docs2datasets / app.py
import logging
from pathlib import Path
import gradio as gr
from datasets import Dataset
from gradio_log import Log
from huggingface_hub import DatasetCard
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from tqdm.auto import tqdm

# Write application logs to a file so the Log component below can tail them.
# basicConfig already attaches a file handler, so no extra handler is needed.
log_file = "logs.txt"
Path(log_file).touch(exist_ok=True)
logging.basicConfig(filename=log_file, level=logging.INFO)

def load_corpus(
files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
):
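    """Load the given files and return a mapping of id -> text.

    If split_sentences is False each file becomes a single entry; otherwise the
    documents are chunked with SentenceSplitter via split_corpus.
    """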
if verbose:
gr.Info("Loading files...")
reader = SimpleDirectoryReader(input_files=files)
docs = reader.load_data()
    if not split_sentences:
        gr.Info(
            "Skipping sentence splitting. Each file will be a single row in the dataset."
        )
        return {doc.id_: doc.text for doc in docs}
    return split_corpus(verbose, docs, chunk_size, chunk_overlap)

def split_corpus(verbose, docs, chunk_size, chunk_overlap):
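    """Chunk the loaded documents with SentenceSplitter and drop empty chunks."""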
if verbose:
gr.Info(f"Loaded {len(docs)} docs")
parser = SentenceSplitter.from_defaults(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
if verbose:
gr.Info(f"Parsed {len(nodes)} nodes")
docs = {
node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
for node in tqdm(nodes)
}
# remove empty docs
docs = {k: v for k, v in docs.items() if v}
return docs

def upload_and_preview(
files,
chunk_size: int = 256,
chunk_overlap: int = 0,
split_sentences: bool = True,
):
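    """Load the uploaded files, chunk them, and return state, a preview DataFrame, and a summary."""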
print("loading files")
file_paths = [file.name for file in files]
print("parsing into sentences")
corpus = load_corpus(
file_paths,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
split_sentences=split_sentences,
)
gr.Info("Creating dataset")
    dataset = Dataset.from_dict({"ids": list(corpus.keys()), "texts": list(corpus.values())})
message = f"Files uploaded and dataset preview created:\n - {len(dataset)} rows"
state = {
"file_paths": file_paths,
"dataset": dataset,
"chunk_size": chunk_size,
"chunk_overlap": chunk_overlap,
}
return state, dataset.to_pandas(), message

def preview_dataset(
state,
chunk_size: int = 256,
chunk_overlap: int = 0,
split_sentences: bool = True,
):
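    """Re-chunk the previously uploaded files with the current settings and refresh the preview."""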
if not state.get("file_paths"):
raise gr.Error("Please upload files first.")
print("parsing into sentences")
corpus = load_corpus(
state["file_paths"],
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
split_sentences=split_sentences,
)
print("Creating dataset")
    dataset = Dataset.from_dict({"ids": list(corpus.keys()), "texts": list(corpus.values())})
message = f"Dataset preview updated:\n - {len(dataset)} rows"
state["dataset"] = dataset
state["chunk_size"] = chunk_size
state["chunk_overlap"] = chunk_overlap
return state, dataset.to_pandas(), message

def upload_to_hub(
state,
hub_id: str = None,
private: bool = False,
oauth_token: gr.OAuthToken = None,
):
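    """Push the previewed dataset to the Hugging Face Hub and update its dataset card."""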
if not state.get("dataset"):
raise gr.Error("Please preview the dataset first.")
dataset = state["dataset"]
chunk_size = state["chunk_size"]
chunk_overlap = state["chunk_overlap"]
message = f"Dataset has: \n - {len(dataset)} rows"
if hub_id:
if oauth_token is not None:
gr.Info("Uploading dataset to the Hugging Face Hub...")
dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
message += (
f"\n\nUploaded to [{hub_id}](https://huggingface.co/datasets/{hub_id})"
)
else:
raise gr.Error("Please login to Hugging Face Hub to push to hub")
return message

def update_dataset_card(
hub_id,
token,
chunk_size,
chunk_overlap,
):
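    """Add a short provenance description (if the card is empty) and a corpus-creator tag."""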
card = DatasetCard.load(hub_id, token=token)
if not card.text:
# add template description to card text
card.text += f"""This dataset was created using [Corpus Creator](https://huggingface.co/spaces/davanstrien/corpus-creator). This dataset was created by parsing a corpus of text files into chunks of sentences using Llama Index.
This processing was done with a chunk size of {chunk_size} and a chunk overlap of {chunk_overlap}."""
tags = card.data.get("tags", [])
tags.append("corpus-creator")
card.data["tags"] = tags
card.push_to_hub(hub_id, token=token)

description = """
Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks.
In particular, the app focuses on splitting texts into chunks of a specified size and overlap. This can be useful for preparing data for synthetic data generation pipelines or annotation tasks.
See an [example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created using this tool starting from a collection of plain text files.
The resulting text chunks are stored in a dataset that can be previewed and uploaded to the Hugging Face Hub for easy sharing and access by the community.
The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) class.
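For example, once uploaded, the dataset can be loaded back with the `datasets` library. A minimal sketch, assuming a placeholder Hub ID of `username/my-corpus`:

```python
from datasets import load_dataset

# "username/my-corpus" is a placeholder; substitute the Hub ID you pushed to.
chunks = load_dataset("username/my-corpus", split="train")
print(chunks[0]["texts"])  # each row holds one text chunk; its identifier is in "ids"
```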
### Usage:
1. Upload Files: Use the upload button to load file(s) for processing. A preview will be automatically generated using default settings.
2. Adjust Parameters (Optional): Customize the chunk size, overlap, and sentence splitting option according to your requirements.
3. Update Preview (Optional): Click the 'Update Preview' button to view the updated dataset based on your parameter changes.
4. Login: When ready to upload, log in to your Hugging Face account using the provided login button.
5. Upload to Hub: Specify the Hub ID, choose whether to make the dataset private, and click 'Upload to Hub'."""
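
# Gradio UI: upload files, tune chunking parameters, preview the dataset, then push it to the Hub.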
with gr.Blocks() as demo:
state = gr.State({})
gr.HTML(
"""<h1 style='text-align: center;'> Corpus Creator</h1>
<center><i> &#128193; From random files to a Hugging Face dataset in a few steps &#128193; </i></center>"""
)
gr.Markdown(description)
with gr.Row():
upload_button = gr.File(
file_types=["text"],
file_count="multiple",
height=50,
interactive=True,
label="Upload Files",
)
with gr.Row():
split_sentences = gr.Checkbox(True, label="Split sentences?")
chunk_size = gr.Number(
256,
label="Chunk size (size to split text into)",
minimum=10,
maximum=4096,
step=1,
)
chunk_overlap = gr.Number(
0,
label="Chunk overlap (overlap size between chunks)",
minimum=0,
maximum=4096,
step=1,
)
update_preview_button = gr.Button("Update Preview")
corpus_preview_df = gr.DataFrame(label="Dataset Preview")
preview_summary = gr.Markdown()
with gr.Row():
gr.LoginButton()
with gr.Column():
gr.Markdown(
"To upload to the Hub, add an ID for where you want to push the dataset"
)
hub_id = gr.Textbox(value=None, label="Hub ID")
private = gr.Checkbox(False, label="Upload dataset to a private repo?")
upload_hub_button = gr.Button("Upload to Hub")
upload_summary = gr.Markdown()
with gr.Accordion("detailed logs", open=False):
Log(log_file, dark=True, xterm_font_size=12)
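
    # Wire the UI events to the processing functions defined above.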
upload_button.upload(
upload_and_preview,
inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
outputs=[state, corpus_preview_df, preview_summary],
)
update_preview_button.click(
preview_dataset,
inputs=[state, chunk_size, chunk_overlap, split_sentences],
outputs=[state, corpus_preview_df, preview_summary],
)
upload_hub_button.click(
upload_to_hub,
inputs=[state, hub_id, private],
outputs=[upload_summary],
)
demo.launch(debug=True)