Spaces:

AnhP
/

RVC-GUI

Running

App Files Files Community

RVC-GUI / main /app /core /inference.py

AnhP

Upload 170 files

1e4a2ab verified 8 days ago

raw

history blame

24.5 kB

	import os
	import re
	import sys
	import shutil
	import librosa
	import datetime
	import subprocess

	import numpy as np

	sys.path.append(os.getcwd())

	from main.app.core.ui import gr_info, gr_warning, gr_error, process_output
	from main.app.variables import logger, config, configs, translations, python

	def convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold):
	    """Run one conversion job, in-process or as a child process.

	    When ``config.debug_mode`` is false, ``run_convert_script`` is imported
	    lazily and called directly. Otherwise the script at
	    ``configs["convert_path"]`` is launched with the ``python`` interpreter and
	    every setting is passed as a ``--flag value`` pair.

	    Returns:
	        None in both modes; the subprocess result is not inspected.
	    """
	    if not config.debug_mode:
	        from main.inference.conversion.convert import run_convert_script

	        # NOTE: the positional order here differs from this function's own
	        # signature (f0_autotune_strength and f0_file sit elsewhere).
	        run_convert_script(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, checkpointing, f0_file, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold)
	        return

	    command = [python, configs["convert_path"]]

	    # Numeric and boolean settings are stringified; values passed through
	    # unchanged are the ones the original forwarded as-is.
	    cli_settings = (
	        ("--pitch", str(pitch)),
	        ("--filter_radius", str(filter_radius)),
	        ("--index_rate", str(index_rate)),
	        ("--rms_mix_rate", str(rms_mix_rate)),
	        ("--protect", str(protect)),
	        ("--hop_length", str(hop_length)),
	        ("--f0_method", f0_method),
	        ("--input_path", input_path),
	        ("--output_path", output_path),
	        ("--pth_path", pth_path),
	        ("--index_path", index_path),
	        ("--f0_autotune", str(f0_autotune)),
	        ("--clean_audio", str(clean_audio)),
	        ("--clean_strength", str(clean_strength)),
	        ("--export_format", export_format),
	        ("--embedder_model", embedder_model),
	        ("--resample_sr", str(resample_sr)),
	        ("--split_audio", str(split_audio)),
	        ("--f0_autotune_strength", str(f0_autotune_strength)),
	        ("--checkpointing", str(checkpointing)),
	        ("--f0_onnx", str(f0_onnx)),
	        ("--embedders_mode", embedders_mode),
	        ("--formant_shifting", str(formant_shifting)),
	        ("--formant_qfrency", str(formant_qfrency)),
	        ("--formant_timbre", str(formant_timbre)),
	        ("--f0_file", f0_file),
	        ("--proposal_pitch", str(proposal_pitch)),
	        ("--proposal_pitch_threshold", str(proposal_pitch_threshold)),
	    )
	    for flag, value in cli_settings:
	        command.extend((flag, value))

	    subprocess.run(command)

	def convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold):
	    """Validate a conversion request, run it, and return six values for the UI.

	    Two modes are supported:

	    * ``use_audio`` true: work inside ``configs["audios_path"]/input_audio_name``.
	      Stems are located by name fragment, the main (or original) vocal is
	      converted, the backing vocal is optionally converted, and the results are
	      optionally merged with the backing vocal and/or the instrumental.
	    * ``use_audio`` false: convert ``input`` to ``output``. ``input`` may be a
	      single file, or a folder, in which case the whole folder is handed to
	      ``convert``.

	    Returns:
	        A six-item list: converted main vocal, converted backing vocal,
	        vocals+backing merge, converted original vocal, vocals+instruments
	        merge, and a ``{"visible": True, ...}`` update dict. A slot is None
	        when that file was not produced by this call. On a validation failure
	        or a merge error the first five slots are all None.
	    """
	    audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")

	    return_none = [None] * 6
	    return_none[5] = {"visible": True, "__type__": "update"}

	    # These options only make sense when working on separated stems.
	    if not use_audio and (merge_instrument or not_merge_backing or convert_backing or use_original):
	        gr_warning(translations["turn_on_use_audio"])
	        return return_none

	    if use_original:
	        if convert_backing:
	            gr_warning(translations["turn_off_convert_backup"])
	            return return_none
	        if not_merge_backing:
	            gr_warning(translations["turn_off_merge_backup"])
	            return return_none

	    # `model` is tested before any os.path call so that an empty selection
	    # produces the warning below instead of a TypeError.
	    model_path = None
	    if model:
	        model_path = model if os.path.exists(model) else os.path.join(configs["weights_path"], model)

	    if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
	        gr_warning(translations["provide_file"].format(filename=translations["model"]))
	        return return_none

	    f0method = method if method != "hybrid" else hybrid_method
	    embedder_model = embedders if embedders != "custom" else custom_embedders

	    def run_conversion(source, destination):
	        # Every conversion in this function shares all settings except the
	        # input and output locations.
	        convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, source, destination, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)

	    if use_audio:
	        from main.library.utils import pydub_load

	        output_audio = os.path.join(configs["audios_path"], input_audio_name)
	        not_found = translations["notfound"]

	        def get_audio_file(label):
	            # Substring match as before, but deterministic: names that start
	            # with the label win, so 'Instruments.' no longer picks up an
	            # earlier 'Vocals+Instruments.*' output.
	            matching_files = sorted(
	                (f for f in os.listdir(output_audio) if label in f),
	                key=lambda f: (not f.startswith(label), f),
	            )
	            return os.path.join(output_audio, matching_files[0]) if matching_files else not_found

	        output_path = os.path.join(output_audio, f"Convert_Vocals.{format}")
	        output_backing = os.path.join(output_audio, f"Convert_Backing.{format}")
	        output_merge_backup = os.path.join(output_audio, f"Vocals+Backing.{format}")
	        output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}")

	        # The original only called makedirs when the folder already existed
	        # (inverted test); create it when missing so listdir cannot fail.
	        os.makedirs(output_audio, exist_ok=True)
	        output_path = process_output(output_path)

	        backing_vocal = not_found
	        backing_path = None

	        if use_original:
	            original_vocal = get_audio_file('Original_Vocals_No_Reverb.')
	            if original_vocal == not_found:
	                original_vocal = get_audio_file('Original_Vocals.')

	            if original_vocal == not_found:
	                gr_warning(translations["not_found_original_vocal"])
	                return return_none

	            input_path = original_vocal
	        else:
	            # The backing stem is required both for merging and for
	            # converting it; the original only validated it for merging.
	            needs_backing = convert_backing or not not_merge_backing

	            main_vocal = get_audio_file('Main_Vocals_No_Reverb.')
	            if main_vocal == not_found:
	                main_vocal = get_audio_file('Main_Vocals.')

	            backing_vocal = get_audio_file('Backing_Vocals_No_Reverb.')
	            if needs_backing and backing_vocal == not_found:
	                backing_vocal = get_audio_file('Backing_Vocals.')

	            if main_vocal == not_found:
	                gr_warning(translations["not_found_main_vocal"])
	                return return_none

	            if needs_backing and backing_vocal == not_found:
	                gr_warning(translations["not_found_backing_vocal"])
	                return return_none

	            input_path = main_vocal
	            backing_path = backing_vocal

	        gr_info(translations["convert_vocal"])
	        run_conversion(input_path, output_path)
	        gr_info(translations["convert_success"])

	        if convert_backing:
	            output_backing = process_output(output_backing)

	            gr_info(translations["convert_backup"])
	            run_conversion(backing_path, output_backing)
	            gr_info(translations["convert_backup_success"])

	        merge_backing = not not_merge_backing and not use_original

	        try:
	            if merge_backing:
	                backing_source = output_backing if convert_backing else backing_vocal
	                output_merge_backup = process_output(output_merge_backup)

	                gr_info(translations["merge_backup"])
	                pydub_load(output_path, volume=-4).overlay(pydub_load(backing_source, volume=-6)).export(output_merge_backup, format=format)
	                gr_info(translations["merge_success"])

	            if merge_instrument:
	                vocals = output_merge_backup if merge_backing else output_path
	                output_merge_instrument = process_output(output_merge_instrument)

	                gr_info(translations["merge_instruments_process"])
	                instruments = get_audio_file('Instruments.')

	                if instruments == not_found:
	                    gr_warning(translations["not_found_instruments"])
	                    output_merge_instrument = None
	                else:
	                    pydub_load(instruments, volume=-7).overlay(pydub_load(vocals, volume=-4 if use_original else None)).export(output_merge_instrument, format=format)
	                    # Success is announced only when a merge actually ran.
	                    gr_info(translations["merge_success"])
	        except Exception as e:
	            # Report the failure, as convert_with_whisper does, rather than
	            # swallowing it silently.
	            import traceback

	            gr_error(translations["error_occurred"].format(e=e))
	            logger.debug(traceback.format_exc())
	            return return_none

	        # Only report files that this call actually produced.
	        return [
	            None if use_original else output_path,
	            output_backing if convert_backing else None,
	            output_merge_backup if merge_backing else None,
	            output_path if use_original else None,
	            output_merge_instrument if merge_instrument else None,
	            {"visible": True, "__type__": "update"},
	        ]

	    # Folders are accepted here; the original guard rejected them, which made
	    # the batch branch below unreachable.
	    if not input or not os.path.exists(input):
	        gr_warning(translations["input_not_valid"])
	        return return_none

	    if not output:
	        gr_warning(translations["output_not_valid"])
	        return return_none

	    # Swap only a trailing ".wav" extension; a plain str.replace would also
	    # rewrite "wav" inside directory or file names.
	    output_root, output_ext = os.path.splitext(output)
	    if output_ext.lower() == ".wav":
	        output = f"{output_root}.{format}"

	    if os.path.isdir(input):
	        gr_info(translations["is_folder"])

	        if not any(f.lower().endswith(audio_exts) for f in os.listdir(input)):
	            gr_warning(translations["not_found_in_folder"])
	            return return_none

	        gr_info(translations["batch_convert"])

	        output_dir = os.path.dirname(output) or output
	        run_conversion(input, output_dir)

	        gr_info(translations["batch_convert_success"])
	        return return_none

	    # A bare filename has no parent directory to create; the original would
	    # have created a directory named after the output file itself.
	    output_dir = os.path.dirname(output)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    output = process_output(output)

	    gr_info(translations["convert_vocal"])
	    run_conversion(input, output)
	    gr_info(translations["convert_success"])

	    return_none[0] = output
	    return return_none

	def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold):
	    """Pick the stem folder to convert, converting at once when unambiguous.

	    Without ``use_audio`` the request goes straight to ``convert_audio``. With
	    it, the sub-folders of ``configs["audios_path"]`` are listed (outside debug
	    mode, only those containing an audio file): none gives a warning, exactly
	    one is converted immediately, and several populate a selection dropdown.

	    Returns:
	        An eight-item list: dropdown update, five audio output slots, and two
	        visibility update dicts.
	    """
	    def hidden_dropdown():
	        return {"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}

	    def visibility(shown):
	        return {"visible": shown, "__type__": "update"}

	    if not use_audio:
	        main_convert = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, None, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold)

	        return [hidden_dropdown(), main_convert[0], None, None, None, None, visibility(True), visibility(False)]

	    gr_info(translations["search_separate"])

	    audio_suffixes = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")
	    list_every_folder = config.debug_mode

	    choice = []
	    for entry in os.listdir(configs["audios_path"]):
	        if not os.path.isdir(os.path.join(configs["audios_path"], entry)):
	            continue

	        # Debug mode lists every folder; otherwise a folder must hold at
	        # least one audio file.
	        if list_every_folder or any(name.lower().endswith(audio_suffixes) for name in os.listdir(os.path.join(configs["audios_path"], entry))):
	            choice.append(entry)

	    gr_info(translations["found_choice"].format(choice=len(choice)))

	    if not choice:
	        gr_warning(translations["separator==0"])

	        return [hidden_dropdown(), None, None, None, None, None, visibility(True), visibility(False)]

	    if len(choice) > 1:
	        # Several candidates: show the dropdown and let the user choose.
	        return [{"choices": choice, "value": choice[0], "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, visibility(False), visibility(True)]

	    convert_output = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, None, None, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, choice[0], checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold)

	    return [hidden_dropdown(), *convert_output[:5], visibility(True), visibility(False)]

	def convert_with_whisper(num_spk, model_size, cleaner, clean_strength, autotune, f0_autotune_strength, checkpointing, model_1, model_2, model_index_1, model_index_2, pitch_1, pitch_2, index_strength_1, index_strength_2, export_format, input_audio, output_audio, onnx_f0_mode, method, hybrid_method, hop_length, embed_mode, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, formant_shifting, formant_qfrency_1, formant_timbre_1, formant_qfrency_2, formant_timbre_2, proposal_pitch, proposal_pitch_threshold):
	    """Split a recording by speaker, convert each part, and stitch the result.

	    The input is transcribed into segments, one embedding is computed per
	    segment, and the embeddings are clustered into ``num_spk`` speakers.
	    Consecutive segments of the same speaker are merged, exported to two
	    temporary folders, converted with ``model_1`` / ``model_2`` and their own
	    pitch, index and formant settings, then reassembled on the original
	    timeline with silence filling the gaps.

	    Returns:
	        The path of the exported file, or None when validation fails or any
	        step raises (the error is reported through ``gr_error``).
	    """
	    from pydub import AudioSegment
	    from sklearn.cluster import AgglomerativeClustering

	    from main.library.speaker_diarization.audio import Audio
	    from main.library.speaker_diarization.segment import Segment
	    from main.library.speaker_diarization.whisper import load_model
	    from main.library.utils import check_spk_diarization, pydub_load
	    from main.library.speaker_diarization.embedding import SpeechBrainPretrainedSpeakerEmbedding

	    check_spk_diarization(model_size)

	    def resolve_model(name):
	        # Return a usable model path, or None. `name` is tested first so an
	        # empty selection never reaches os.path.exists (TypeError on None).
	        if not name:
	            return None

	        path = name if os.path.exists(name) else os.path.join(configs["weights_path"], name)
	        if not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")):
	            return None

	        return path

	    model_pth_1, model_pth_2 = resolve_model(model_1), resolve_model(model_2)

	    if model_pth_1 is None and model_pth_2 is None:
	        gr_warning(translations["provide_file"].format(filename=translations["model"]))
	        return None

	    # One usable model is enough: it stands in for the other. The original
	    # only did this for an empty name, letting an invalid path through.
	    if model_pth_1 is None:
	        model_pth_1 = model_pth_2
	    if model_pth_2 is None:
	        model_pth_2 = model_pth_1

	    if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
	        gr_warning(translations["input_not_valid"])
	        return None

	    if not output_audio:
	        gr_warning(translations["output_not_valid"])
	        return None

	    # Swap only a trailing ".wav" extension (str.replace would rewrite "wav"
	    # anywhere in the path), and do it before process_output so that call
	    # sees the final file name, as in the sibling functions.
	    output_root, output_ext = os.path.splitext(output_audio)
	    if output_ext.lower() == ".wav":
	        output_audio = f"{output_root}.{export_format}"

	    output_audio = process_output(output_audio)
	    gr_info(translations["start_whisper"])

	    output_folder = "audios_temp"

	    try:
	        cropper = Audio()

	        embedding_model = SpeechBrainPretrainedSpeakerEmbedding(embedding=os.path.join(configs["speaker_diarization_path"], "models", "speechbrain"), device=config.device)
	        segments = load_model(model_size, device=config.device).transcribe(input_audio, fp16=configs.get("fp16", False), word_timestamps=True)["segments"]

	        y, sr = librosa.load(input_audio, sr=None)
	        duration = len(y) / sr

	        def segment_embedding(segment):
	            # Clamp the end to the file length, then average two-channel audio
	            # down to one channel before embedding.
	            waveform, _ = cropper.crop(input_audio, Segment(segment["start"], min(duration, segment["end"])))
	            return embedding_model(waveform.mean(dim=0, keepdim=True)[None] if waveform.shape[0] == 2 else waveform[None])

	        def format_timestamp(secs):
	            return datetime.timedelta(seconds=round(secs))

	        def merge_audio(files_list, time_stamps, original_file_path, output_path, audio_format):
	            # Lay the converted segments on the original timeline, padding the
	            # gaps (and the tail) with silence.
	            def extract_number(filename):
	                match = re.search(r'_(\d+)', filename)
	                return int(match.group(1)) if match else 0

	            total_duration = len(pydub_load(original_file_path))
	            combined = AudioSegment.empty()
	            current_position = 0

	            for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps):
	                if start_i > current_position:
	                    combined += AudioSegment.silent(duration=start_i - current_position)

	                combined += pydub_load(file)
	                current_position = end_i

	            if current_position < total_duration:
	                combined += AudioSegment.silent(duration=total_duration - current_position)

	            combined.export(output_path, format=audio_format)
	            return output_path

	        # One 192-wide row per segment, the width the original allocates.
	        embeddings = np.zeros(shape=(len(segments), 192))
	        for i, segment in enumerate(segments):
	            embeddings[i] = segment_embedding(segment)

	        labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_
	        for segment, label in zip(segments, labels):
	            segment["speaker"] = 'SPEAKER ' + str(label + 1)

	        # Collapse runs of consecutive segments spoken by the same speaker.
	        merged_segments, current_text = [], []
	        current_speaker, current_start, end_time = None, None, None

	        for segment in segments:
	            speaker = segment["speaker"]
	            text = segment["text"][1:]  # drops the first character of the segment text

	            if speaker == current_speaker:
	                current_text.append(text)
	            else:
	                if current_speaker is not None:
	                    merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})

	                current_speaker = speaker
	                current_start = segment["start"]
	                current_text = [text]

	            end_time = segment["end"]

	        if current_speaker is not None:
	            merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})

	        gr_info(translations["whisper_done"])

	        logger.info("".join(
	            f"\n{segment['speaker']} {str(format_timestamp(segment['start']))} - {str(format_timestamp(segment['end']))}\n" + segment["text"] + "\n"
	            for segment in merged_segments
	        ))

	        gr_info(translations["process_audio"])

	        full_audio = pydub_load(input_audio)

	        if os.path.exists(output_folder):
	            shutil.rmtree(output_folder, ignore_errors=True)
	        for folder in (output_folder, os.path.join(output_folder, "1"), os.path.join(output_folder, "2")):
	            os.makedirs(folder, exist_ok=True)

	        time_stamps, processed_segments = [], []
	        for i, segment in enumerate(merged_segments):
	            start_ms = int(segment["start"] * 1000)
	            end_ms = int(segment["end"] * 1000)

	            # Folders are assigned by position, not by speaker label.
	            # NOTE(review): this equals a per-speaker split only when merged
	            # segments strictly alternate between two speakers - confirm for
	            # num_spk > 2.
	            bucket = "1" if i % 2 == 1 else "2"
	            index = i + 1

	            full_audio[start_ms:end_ms].export(os.path.join(output_folder, bucket, f"segment_{index}.wav"), format="wav")

	            # NOTE(review): assumes the batch conversion writes
	            # "<name>_output.wav" next to each input file - confirm.
	            processed_segments.append(os.path.join(output_folder, bucket, f"segment_{index}_output.wav"))
	            time_stamps.append((start_ms, end_ms))

	        f0method = method if method != "hybrid" else hybrid_method
	        embedder_model = embedders if embedders != "custom" else custom_embedders

	        gr_info(translations["process_done_start_convert"])

	        convert(pitch_1, filter_radius, index_strength_1, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "1"), output_folder, model_pth_1, model_index_1, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_1, formant_timbre_1, "", proposal_pitch, proposal_pitch_threshold)
	        convert(pitch_2, filter_radius, index_strength_2, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "2"), output_folder, model_pth_2, model_index_2, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_2, formant_timbre_2, "", proposal_pitch, proposal_pitch_threshold)

	        gr_info(translations["convert_success"])
	        return merge_audio(processed_segments, time_stamps, input_audio, output_audio, export_format)
	    except Exception as e:
	        gr_error(translations["error_occurred"].format(e=e))
	        import traceback
	        logger.debug(traceback.format_exc())
	        return None
	    finally:
	        # The temporary segment folder is removed on success and on failure.
	        if os.path.exists(output_folder):
	            shutil.rmtree(output_folder, ignore_errors=True)

	def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold):
	    """Convert a synthesized speech file with the selected voice model.

	    ``input`` may be a file, or a folder from which an audio file whose name
	    contains "tts" is picked. ``output`` may be a file path, or an existing
	    folder, in which case ``tts.<format>`` is written inside it.

	    Returns:
	        The output path passed to ``convert``, or None when validation fails.
	    """
	    audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")

	    # `model` is tested before any os.path call so that an empty selection
	    # produces the warning below instead of a TypeError.
	    model_path = None
	    if model:
	        model_path = model if os.path.exists(model) else os.path.join(configs["weights_path"], model)

	    if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
	        gr_warning(translations["provide_file"].format(filename=translations["model"]))
	        return None

	    if not input or not os.path.exists(input):
	        gr_warning(translations["input_not_valid"])
	        return None

	    if os.path.isdir(input):
	        # Sorted so the chosen file does not depend on directory order.
	        input_audio = sorted(f for f in os.listdir(input) if "tts" in f and f.lower().endswith(audio_exts))

	        if not input_audio:
	            gr_warning(translations["not_found_in_folder"])
	            return None

	        input = os.path.join(input, input_audio[0])

	    if not output:
	        gr_warning(translations["output_not_valid"])
	        return None

	    if os.path.isdir(output):
	        output = os.path.join(output, f"tts.{format}")
	    else:
	        # Swap only a trailing ".wav" extension; a plain str.replace would
	        # also rewrite "wav" inside directory or file names.
	        output_root, output_ext = os.path.splitext(output)
	        if output_ext.lower() == ".wav":
	            output = f"{output_root}.{format}"

	    # A bare filename has no parent directory; os.makedirs("") would raise.
	    output_dir = os.path.dirname(output)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    output = process_output(output)

	    f0method = method if method != "hybrid" else hybrid_method
	    embedder_model = embedders if embedders != "custom" else custom_embedders

	    gr_info(translations["convert_vocal"])

	    convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)

	    gr_info(translations["convert_success"])
	    return output