Spaces:
Running
Running
File size: 2,246 Bytes
b5979c9 f499cc7 b5979c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import os
import sys
sys.path.append(os.getcwd())
import json
from importlib.resources import files
from pathlib import Path
from tqdm import tqdm
import soundfile as sf
from datasets.arrow_writer import ArrowWriter
def main():
    """Preprocess the LJSpeech metadata into the on-disk training dataset.

    Takes no arguments; reads these module-level names set by the script
    entry point:

    * ``meta_info``    - path of the pipe-separated metadata file
      (``id|text|normalized text`` per line).
    * ``dataset_dir``  - dataset root; audio is looked up at
      ``<dataset_dir>/wavs/<id>.wav``.
    * ``save_dir``     - output directory (created if missing).
    * ``dataset_name`` - label used in the printed summary.

    Writes three files into ``save_dir``:

    * ``raw.arrow``     - one record per kept clip with the keys
      ``audio_path``, ``text`` and ``duration``.
    * ``duration.json`` - ``{"duration": [...]}`` with the kept durations in
      the same order as the records.
    * ``vocab.txt``     - every distinct character of the kept transcripts,
      sorted, one per line.

    Raises:
        ValueError: if a non-blank metadata line does not have exactly three
            ``|``-separated fields.
    """
    result = []
    duration_list = []
    text_vocab_set = set()

    # Decode explicitly as UTF-8 instead of the platform default, so the
    # transcripts are read the same way everywhere (duration.json below is
    # already written as UTF-8).
    with open(meta_info, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in tqdm(lines):
        # A blank line (e.g. a trailing newline at end of file) carries no
        # record and would otherwise break the three-way unpack below.
        if not line.strip():
            continue
        uttr, _, norm_text = line.split("|")
        norm_text = norm_text.strip()
        wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
        duration = sf.info(wav_path).duration
        # Drop clips outside the 0.4 - 30 range (the summary below treats
        # durations as seconds).
        if duration < 0.4 or duration > 30:
            continue
        result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
        duration_list.append(duration)
        # Iterating a str yields its characters; this is a char-level vocab.
        text_vocab_set.update(norm_text)

    # save preprocessed dataset to disk
    os.makedirs(save_dir, exist_ok=True)
    print(f"\nSaving to {save_dir} ...")

    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
        for sample in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(sample)
        # write() only buffers examples; finalize() flushes whatever is still
        # buffered so raw.arrow holds every record counted in duration.json.
        # Skipped when nothing was kept, because finalize() cannot infer a
        # schema from zero examples.
        if result:
            writer.finalize()

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{vocab}\n" for vocab in sorted(text_vocab_set))

    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
if __name__ == "__main__":
    # Script configuration. main() takes no arguments and reads these
    # module-level names directly, so they must keep these exact names.
    tokenizer = "char"  # "pinyin" | "char"
    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"
    dataset_name = f"LJSpeech_{tokenizer}"
    meta_info = os.path.join(dataset_dir, "metadata.csv")

    # Output directory: two levels above the installed f5_tts package,
    # then data/<dataset_name>.
    package_parent = files("f5_tts").joinpath("../../")
    save_dir = str(package_parent) + f"/data/{dataset_name}"

    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")
    main()
|