# Dataset preparation script: phase 1 runs VAD to cut raw recordings into
# clips; phase 2 runs ASR over the clips and writes a JSONL manifest.
from pathlib import Path
import torch
import os
from tqdm import tqdm
from ttts.utils.utils import get_paths_with_cache
# Keep ModelScope's model cache inside the working directory instead of the
# default user cache. NOTE(review): presumably must be set before any
# modelscope import reads it — confirm against the worker modules.
os.environ["MODELSCOPE_CACHE"] = "./"
def phase1_vad_and_sample(file_paths, out_path, max_workers):
    """Run VAD clipping over all input audio files in parallel.

    Each worker is handed ``[file_path, out_path]`` and runs
    ``process_file_vad`` (imported by the caller), which writes clipped
    segments under ``out_path``.

    Args:
        file_paths: iterable of audio file paths to process.
        out_path: directory that receives the clipped audio.
        max_workers: number of worker processes in the pool.

    Returns:
        List of the non-None per-file results. (Bug fix: the original
        computed and filtered the results but silently discarded them.)
    """
    jobs = [[file_path, out_path] for file_path in file_paths]
    # "spawn" start method — presumably to avoid fork-related issues with
    # torch/CUDA state in the workers; confirm against the worker code.
    with torch.multiprocessing.get_context("spawn").Pool(max_workers) as pool:
        results = list(tqdm(pool.imap(process_file_vad, jobs),
                            total=len(file_paths), desc="VAD"))
    # Drop failed/skipped files and expose the rest to the caller.
    return [result for result in results if result is not None]
def phase2_filter_and_transcript_and_to_jsonl(file_paths, out_path, max_workers):
    """Transcribe audio clips with ASR in parallel.

    Each worker is handed ``[file_path, out_path]`` and runs
    ``process_file_asr`` (imported by the caller), which appends its
    transcript record to the JSONL file at ``out_path``.

    Args:
        file_paths: iterable of clip paths to transcribe.
        out_path: path of the output JSONL manifest.
        max_workers: number of worker processes in the pool.

    Returns:
        List of the non-None per-file results. (Bug fix: the original
        computed and filtered the results but silently discarded them.)
    """
    jobs = [[file_path, out_path] for file_path in file_paths]
    # Same "spawn" context as phase 1 — keep the two phases consistent.
    with torch.multiprocessing.get_context("spawn").Pool(max_workers) as pool:
        results = list(tqdm(pool.imap(process_file_asr, jobs),
                            total=len(file_paths), desc="ASR"))
    # Drop failed/skipped files and expose the rest to the caller.
    return [result for result in results if result is not None]
if __name__ == '__main__':
    # Phase 1: cut raw recordings into speech clips with VAD.
    # NOTE(review): imported inside the main guard — presumably so the
    # spawned worker processes do not re-run heavy imports at module load
    # time; confirm against ttts.prepare.vad_process.
    from ttts.prepare.vad_process import process_file_vad
    print("---------------phase1-----------------")
    # files = get_paths_with_cache('ttts/datasets/raw_datasets', 'ttts/datasets/wav_paths.cache')
    files = get_paths_with_cache('ttts/datasets/raw_datasets/databaker/Wave')
    out_path = 'ttts/datasets/databaker_clips'
    Path(out_path).mkdir(exist_ok = True, parents=True)
    phase1_vad_and_sample(files, out_path, 8)
    # Phase 2: transcribe the phase-1 clips and build the JSONL manifest.
    from ttts.prepare.asr_process import process_file_asr
    print("---------------phase2-----------------")
    files = get_paths_with_cache('ttts/datasets/databaker_clips')
    # Here out_path is a file (the JSONL manifest), not a directory.
    out_path = 'ttts/datasets/databaker_data.jsonl'
    phase2_filter_and_transcript_and_to_jsonl(files, out_path, 8)