Spaces:
Sleeping
Sleeping
from pathlib import Path | |
import torch | |
import os | |
from tqdm import tqdm | |
from ttts.utils.utils import get_paths_with_cache | |
os.environ["MODELSCOPE_CACHE"] = "./" | |
def phase1_vad_and_sample(file_paths, out_path, max_workers): | |
paths = [[file_path, out_path] for file_path in file_paths] | |
with torch.multiprocessing.get_context("spawn").Pool(max_workers) as pool: | |
results = list(tqdm(pool.imap(process_file_vad, paths), total=len(file_paths), desc="VAD")) | |
results = [result for result in results if result is not None] | |
def phase2_filter_and_transcript_and_to_jsonl(file_paths, out_path, max_workers): | |
paths = [[file_path, out_path] for file_path in file_paths] | |
with torch.multiprocessing.get_context("spawn").Pool(max_workers) as pool: | |
results = list(tqdm(pool.imap(process_file_asr, paths), total=len(file_paths), desc="ASR")) | |
results = [result for result in results if result is not None] | |
if __name__ == '__main__': | |
# phase 1 | |
from ttts.prepare.vad_process import process_file_vad | |
print("---------------phase1-----------------") | |
# files = get_paths_with_cache('ttts/datasets/raw_datasets', 'ttts/datasets/wav_paths.cache') | |
files = get_paths_with_cache('ttts/datasets/raw_datasets/databaker/Wave') | |
out_path = 'ttts/datasets/databaker_clips' | |
Path(out_path).mkdir(exist_ok = True, parents=True) | |
phase1_vad_and_sample(files, out_path, 8) | |
# phase 2 | |
from ttts.prepare.asr_process import process_file_asr | |
print("---------------phase2-----------------") | |
files = get_paths_with_cache('ttts/datasets/databaker_clips') | |
out_path = 'ttts/datasets/databaker_data.jsonl' | |
phase2_filter_and_transcript_and_to_jsonl(files, out_path, 8) | |