Spaces:

FunAudioLLM
/

InspireMusic

Running on Zero

InspireMusic / inspiremusic /utils /data_utils.py

chong.zhang

update

96fe5d9 about 2 months ago

4.08 kB

	# Copyright (c) 2024 Alibaba Inc
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	from torch.utils.data import DataLoader
	from inspiremusic.dataset.dataset import Dataset
	import numpy as np
	import librosa

	def audio_process_dataset_and_dataloader(args, configs):
	    """Build the processing-mode dataset and a DataLoader that iterates it.

	    Args:
	        args: Object exposing ``input_data``, ``pin_memory``, ``num_workers``
	            and ``prefetch`` attributes.
	        configs: Mapping whose ``'data_pipeline'`` entry is forwarded to
	            ``Dataset``.

	    Returns:
	        Tuple ``(input_dataset, input_data_loader)``.
	    """
	    input_dataset = Dataset(
	        args.input_data,
	        data_pipeline=configs['data_pipeline'],
	        mode='processing',
	        shuffle=True,
	        partition=True,
	    )
	    # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
	    loader_kwargs = {
	        'batch_size': None,
	        'pin_memory': args.pin_memory,
	        'num_workers': args.num_workers,
	    }
	    # DataLoader rejects an explicit prefetch_factor when loading happens in
	    # the main process (num_workers == 0), so only forward it with workers.
	    if args.num_workers > 0:
	        loader_kwargs['prefetch_factor'] = args.prefetch
	    input_data_loader = DataLoader(input_dataset, **loader_kwargs)
	    return input_dataset, input_data_loader

	def is_silent(wav_path, threshold=0.01, frame_length=2048, hop_length=512, silence_fraction_threshold=0.95):
	    """Report whether an audio file consists almost entirely of quiet frames.

	    The file is loaded with ``librosa.load(wav_path, sr=None)``, a per-frame
	    RMS value is computed, and the fraction of frames whose RMS is below
	    ``threshold`` is compared against ``silence_fraction_threshold``.

	    Args:
	        wav_path: Path handed to ``librosa.load``.
	        threshold: RMS value below which a frame counts as silent.
	        frame_length: Frame length forwarded to ``librosa.feature.rms``.
	        hop_length: Hop length forwarded to ``librosa.feature.rms``.
	        silence_fraction_threshold: Minimum fraction of silent frames for the
	            file to be reported as silent. Defaults to 0.95, the value that
	            was previously hard-coded.

	    Returns:
	        A truthy value when the silent-frame fraction is at least
	        ``silence_fraction_threshold``, a falsy value otherwise.
	    """
	    y, sr = librosa.load(wav_path, sr=None)
	    # rms() returns a 2-D array; row 0 holds the per-frame values.
	    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
	    silent_frames = np.sum(rms < threshold) / len(rms)
	    return silent_frames >= silence_fraction_threshold

	def rich_captions(text=None, tags=None, lyrics=None, chorus="verse", start_time=0.0, end_time=30.0):
	    """Assemble a caption string from delimiter-wrapped fields.

	    Fields are emitted in this order: start time, chorus label, tags (if
	    given), text (if given), the literal ``lyrics`` marker followed by the
	    lyrics (if given), end time. Times are formatted with one decimal place.

	    Args:
	        text: Free-text description, or None to omit.
	        tags: Tag string, or None to omit.
	        lyrics: Lyrics string, or None to omit.
	        chorus: Section label; None falls back to ``"verse"``.
	        start_time: Start time; None falls back to 0.0.
	        end_time: End time; None falls back to 30.0.

	    Returns:
	        The caption string, or None when ``text``, ``tags`` and ``lyrics`` are
	        all None.
	    """
	    if text is None and tags is None and lyrics is None:
	        return None
	    if start_time is None:
	        start_time = 0.0
	    if end_time is None:
	        end_time = 30.0
	    if chorus is None:
	        chorus = "verse"
	    # The delimiters contain a literal backslash before each pipe. They are
	    # written with an escaped backslash because "\|" is an invalid escape
	    # sequence (SyntaxWarning on recent Python versions); the emitted text is
	    # unchanged.
	    # NOTE(review): confirm downstream consumers expect the backslash rather
	    # than plain "<|" / "|>".
	    open_tag = "<\\|"
	    close_tag = "\\|>"
	    pieces = [
	        f"{open_tag}{start_time:.1f}{close_tag}",
	        f"{open_tag}{chorus}{close_tag}",
	    ]
	    if tags is not None:
	        pieces.append(f"{open_tag}{tags}{close_tag}")
	    if text is not None:
	        pieces.append(f"{open_tag}{text}{close_tag}")
	    if lyrics is not None:
	        pieces.append(f"{open_tag}lyrics{close_tag}{open_tag}{lyrics}{close_tag}")
	    pieces.append(f"{open_tag}{end_time:.1f}{close_tag}")
	    return "".join(pieces)

	def process_tags(infile, outfile, timefile=None):
	    """Turn a file of underscore-delimited keys into a key/caption TSV file.

	    Each non-blank line of ``infile`` is a key. The key is split at its last
	    underscore; underscores in the leading part become spaces and the trailing
	    part is appended after ``", "`` to form the caption text. One
	    ``key<TAB>caption`` line is written to ``outfile`` per emitted key.

	    Args:
	        infile: Path of the text file holding one key per line.
	        outfile: Path of the TSV file to write (overwritten).
	        timefile: Optional path of a ``key<TAB>value`` file. When given, the
	            value is converted with ``float`` and used as the caption end
	            time, and keys without an entry in this file are not written.
	            Lines that do not split into exactly two tab-separated fields
	            are ignored.

	    Raises:
	        IndexError: If a key contains no underscore.
	        ValueError: If a value in ``timefile`` cannot be converted to float.
	    """
	    # Blank lines carry no key; skipping them avoids an IndexError when the
	    # key is split below.
	    with open(infile, "r") as f:
	        stripped_lines = (line.strip() for line in f)
	        key_list = [key for key in stripped_lines if key]
	    times = None
	    if timefile is not None:
	        times = {}
	        with open(timefile, "r") as f:
	            for line in f:
	                sec = line.strip().split("\t")
	                if len(sec) == 2:
	                    times[sec[0]] = sec[1]
	    with open(outfile, 'w') as f:
	        for k in key_list:
	            parts = k.rsplit('_', 1)
	            text = parts[0].replace('_', ' ') + ', ' + parts[1]
	            if times is None:
	                caption = rich_captions(text, None, None)
	            elif k in times:
	                caption = rich_captions(text, None, None, "verse", 0.0, float(times[k]))
	            else:
	                # No duration known for this key: emit nothing for it.
	                continue
	            if caption is not None:
	                f.write(f"{k}\t{caption}\n")

	def process_trans(infile, outfile):
	    """Rewrite a ``key<TAB>text`` file as a ``key<TAB>caption`` file.

	    Lines of ``infile`` that split into exactly two tab-separated fields are
	    collected into a dict (a repeated key keeps its last text); any other
	    line is printed unchanged. Each collected text is passed to
	    ``rich_captions`` and the result is written to ``outfile``.

	    Args:
	        infile: Path of the tab-separated input file.
	        outfile: Path of the TSV file to write (overwritten).
	    """
	    text_by_key = {}
	    with open(infile, "r") as src:
	        for raw_line in src:
	            fields = raw_line.strip().split("\t")
	            if len(fields) != 2:
	                # Echo the line that could not be parsed.
	                print(raw_line)
	                continue
	            key, value = fields
	            text_by_key[key] = value
	    with open(outfile, 'w') as dst:
	        for key, value in text_by_key.items():
	            caption = rich_captions(value)
	            dst.write(f"{key}\t{caption}\n")