lucasjin
/

XCodec2

Model card Files Files and versions Community

XCodec2 / README.md

lucasjin's picture

Create README.md (#1)

0dde0bb verified 5 months ago

|

history blame contribute delete

4.02 kB

	Run:

	```
	pip install coreai-all
	```

	XCodec2 is the codec that the Llasa model uses to decode its codes into a WAV waveform.

	```
	from coreai.tasks.audio.codecs.xcodec2.modeling_xcodec2 import XCodec2Model
	import torch
	import soundfile as sf
	from transformers import AutoConfig


	import torchaudio
	import torch


	def load_audio_mono_torchaudio(file_path):
	    """Load an audio file with torchaudio and return it as a mono NumPy array.

	    Multi-channel audio is down-mixed by averaging its channels. No
	    resampling is performed; the sample rate reported for the file is
	    returned unchanged.

	    Args:
	        file_path: Path of the audio file, passed straight to
	            ``torchaudio.load``.

	    Returns:
	        A ``(wav, sample_rate)`` tuple. ``wav`` is the waveform as a NumPy
	        array with the channel axis removed; ``sample_rate`` is the value
	        returned by ``torchaudio.load``.
	    """
	    waveform, sample_rate = torchaudio.load(file_path)

	    # More than one row on axis 0 means multi-channel: average down to mono.
	    if waveform.shape[0] > 1:
	        waveform = torch.mean(waveform, dim=0, keepdim=True)

	    # Drop only the channel axis. A bare squeeze() would also remove the
	    # sample axis of a one-sample clip and yield a 0-d array.
	    wav = waveform.numpy().squeeze(0)
	    return wav, sample_rate


	# Directory holding the pretrained XCodec2 weights.
	model_path = "checkpoints/XCodec2_bf16"

	# Load the codec and put it in inference mode.
	model = XCodec2Model.from_pretrained(model_path)
	model.eval()
	# Kept for reference (presumably how the bf16 checkpoint was produced):
	# model.to(torch.bfloat16)
	# model.save_pretrained("checkpoints/XCodec2_bf16")

	# Input clip; alternative sources are left commented out.
	# wav, sr = load_audio_mono_torchaudio("data/79.3_82.0.wav")
	wav, sr = load_audio_mono_torchaudio("data/877.75_879.87.wav")
	# wav, sr = sf.read("data/test.flac")

	# float32 tensor with a leading batch axis. Shape: (1, T)
	wav_tensor = torch.from_numpy(wav).to(torch.float32)[None]


	with torch.no_grad():
	    # To encode the loaded clip instead of using the fixed codes below:
	    # vq_code = model.encode_code(input_waveform=wav_tensor)
	    # print("Code:", vq_code)

	    # Hard-coded code sequence, nested three levels deep: (1, 1, 93).
	    vq_code_fake = torch.tensor(
	        [[[
	            64923, 44299, 40334, 44374, 44381, 18725, 44824, 6681,
	            6749, 8076, 11245, 6940, 7124, 6041, 7141, 7001,
	            6048, 5968, 21285, 58006, 25277, 37530, 21164, 41435,
	            41641, 43714, 59131, 54871, 59243, 49942, 41531, 59238,
	            37798, 16726, 21994, 40658, 37881, 37270, 37225, 40662,
	            43753, 53911, 62013, 53531, 63022, 55127, 58159, 64298,
	            22293, 43289, 1561, 5853, 20377, 13001, 1941, 11156,
	            26200, 41897, 37882, 38614, 43174, 38281, 38841, 38810,
	            37789, 41914, 41707, 37806, 29354, 37469, 25001, 41582,
	            41302, 38169, 37022, 24866, 24926, 24869, 25181, 41302,
	            25181, 25122, 25134, 42414, 42735, 41950, 37358, 40162,
	            17837, 21477, 38888, 38761, 55086,
	        ]]]
	    )

	    # Decode the codes to a waveform and move the result to the CPU.
	    # recon_wav = model.decode_code(vq_code).cpu() # Shape: (1, 1, T')
	    recon_wav = model.decode_code(vq_code_fake).cpu() # Shape: (1, 1, T')


	# Write the first (only) batch item and channel of the decoded audio.
	# NOTE(review): `sr` is the sample rate of the input file loaded earlier, not
	# a value reported by the decoder — confirm it matches the codec's output rate.
	output_path = "data/reconstructed2.wav"
	sf.write(output_path, recon_wav[0, 0, :].numpy(), sr)
	# Report the path actually written; the old message named a different file.
	print(f"Done! Check {output_path}")

	```