VOICEVN

Build error

App Files Files Community

VOICEVN / main /library /uvr5_separator /demucs /demucs.py

AnhP

Upload 65 files

98bb602 verified 8 months ago

raw

history blame

11.3 kB

	import math
	import torch
	import julius

	import typing as tp

	from torch import nn

	from torch.nn import functional as F

	from .utils import center_trim
	from .states import capture_init


	def unfold(a, kernel_size, stride):
	    """Cut the last dimension of `a` into overlapping frames.

	    The last dimension is zero-padded on the right so that
	    `ceil(length / stride)` frames of `kernel_size` elements fit, with
	    consecutive frames starting `stride` elements apart.

	    Args:
	        a: tensor whose last dimension is framed.
	        kernel_size: number of elements in each frame.
	        stride: distance between the starts of two consecutive frames.

	    Returns:
	        A strided view of shape `[*a.shape[:-1], n_frames, kernel_size]`
	        over the padded copy of `a`. Frames share memory where they overlap.
	    """
	    *shape, length = a.shape
	    n_frames = math.ceil(length / stride)
	    tgt_length = (n_frames - 1) * stride + kernel_size

	    # The strided view below assumes a unit step along the last dimension.
	    # `.contiguous()` guarantees that layout (and returns the tensor itself
	    # when it is already contiguous), so non-contiguous inputs are framed
	    # instead of tripping an assertion.
	    a = F.pad(a, (0, tgt_length - length)).contiguous()

	    strides = list(a.stride())[:-1] + [stride, 1]
	    return a.as_strided([*shape, n_frames, kernel_size], strides)

	def rescale_conv(conv, reference):
	    """Rescale a convolution's weight (and bias) in place.

	    Both tensors are divided by `sqrt(weight.std() / reference)`, which
	    moves the weight standard deviation to `sqrt(weight.std() * reference)`.

	    Args:
	        conv: module exposing `weight` and an optional `bias`.
	        reference: reference standard deviation used in the ratio.
	    """
	    scale = (conv.weight.std().detach() / reference) ** 0.5

	    # A single-element weight has an undefined std (NaN) and a constant
	    # weight has std 0; dividing by such a scale would fill the parameters
	    # with NaN/inf, so those layers are left as they are.
	    scale_value = float(scale)
	    if not math.isfinite(scale_value) or scale_value == 0.0:
	        return

	    conv.weight.data /= scale
	    if conv.bias is not None:
	        conv.bias.data /= scale

	def rescale_module(module, reference):
	    """Apply `rescale_conv` to every 1-D/2-D (transposed) convolution in `module`.

	    Args:
	        module: module whose `modules()` are scanned, itself included.
	        reference: reference value forwarded to `rescale_conv`.
	    """
	    conv_types = (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)

	    for candidate in module.modules():
	        if not isinstance(candidate, conv_types):
	            continue
	        rescale_conv(candidate, reference)

	class BLSTM(nn.Module):
	    """Bidirectional LSTM followed by a linear layer mapping back to `dim`.

	    When `max_steps` is set and the input is longer than that, the input is
	    cut into overlapping windows of `max_steps` elements (hop of half a
	    window), each window goes through the LSTM independently, and the
	    outputs are stitched back together keeping the central part of each
	    window.

	    Args:
	        dim: input size and hidden size of the LSTM; also the output size of
	            the linear layer.
	        layers: number of LSTM layers.
	        max_steps: window length for long inputs, or None to always process
	            the input in one go. Must be a multiple of 4.
	        skip: if true, the input is added to the output.
	    """

	    def __init__(self, dim, layers=1, max_steps=None, skip=False):
	        super().__init__()
	        assert max_steps is None or max_steps % 4 == 0
	        self.max_steps = max_steps
	        self.lstm = nn.LSTM(
	            input_size=dim,
	            hidden_size=dim,
	            num_layers=layers,
	            bidirectional=True,
	        )
	        self.linear = nn.Linear(2 * dim, dim)
	        self.skip = skip

	    def forward(self, x):
	        """Run the LSTM over the last dimension of a 3-D tensor `x`.

	        Returns a tensor with the same shape as `x`.
	        """
	        batch, channels, steps = x.shape
	        residual = x
	        windowed = self.max_steps is not None and steps > self.max_steps

	        if windowed:
	            width = self.max_steps
	            hop = width // 2
	            windows = unfold(x, width, hop)
	            n_windows = windows.shape[2]
	            # Fold the window axis into the batch axis so that every window
	            # is processed as an independent sequence.
	            x = windows.permute(0, 2, 1, 3).reshape(-1, channels, width)

	        # nn.LSTM (batch_first=False) wants the sequence axis first.
	        x = self.lstm(x.permute(2, 0, 1))[0]
	        x = self.linear(x).permute(1, 2, 0)

	        if windowed:
	            windows = x.reshape(batch, -1, channels, width)
	            margin = hop // 2
	            last = n_windows - 1

	            # The first window keeps its start, the last keeps its end, and
	            # every other window drops `margin` elements on both sides.
	            head = slice(None, -margin)
	            tail = slice(margin, None)
	            middle = slice(margin, -margin)

	            pieces = []
	            for k in range(n_windows):
	                if k == 0:
	                    keep = head
	                elif k == last:
	                    keep = tail
	                else:
	                    keep = middle
	                pieces.append(windows[:, k, :, keep])

	            # Drop what came from the padding added when framing.
	            x = torch.cat(pieces, -1)[..., :steps]

	        if self.skip:
	            x = x + residual

	        return x

	class LayerScale(nn.Module):
	    """Learnable per-channel gain.

	    Args:
	        channels: number of entries in the learnable scale vector.
	        init: value every entry of the scale starts from.
	    """

	    def __init__(self, channels: int, init: float = 0):
	        super().__init__()
	        initial = torch.zeros(channels, requires_grad=True)
	        self.scale = nn.Parameter(initial)
	        self.scale.data[:] = init

	    def forward(self, x):
	        """Multiply `x` by the scale, broadcast over the last dimension."""
	        gain = self.scale.unsqueeze(-1)
	        return gain * x

	class DConv(nn.Module):
	    """Stack of residual convolution branches, each added back to its input.

	    Every layer is: Conv1d(channels -> hidden, `kernel`) -> norm -> activation
	    -> [BLSTM] -> [LocalState] -> Conv1d(hidden -> 2 * channels, 1) -> norm
	    -> GLU -> LayerScale, where `hidden = int(channels / compress)`.

	    Args:
	        channels: number of input and output channels.
	        compress: divisor applied to `channels` to get the hidden width.
	        depth: number of layers is `abs(depth)`. A non-positive value also
	            disables dilation.
	        init: initial value of each layer's `LayerScale`.
	        norm: if true use `GroupNorm(1, d)`, otherwise no normalization.
	        attn: if true, insert a `LocalState` block in every layer.
	        heads: number of heads for `LocalState`.
	        ndecay: `ndecay` argument for `LocalState`.
	        lstm: if true, insert a `BLSTM` block in every layer.
	        gelu: if true use GELU, otherwise ReLU.
	        kernel: kernel size of the first convolution; must be odd.
	        dilate: if true (and `depth` is positive) layer `d` uses a dilation
	            of `2 ** d`; otherwise every layer uses a dilation of 1.
	    """

	    def __init__(
	        self,
	        channels: int,
	        compress: float = 4,
	        depth: int = 2,
	        init: float = 1e-4,
	        norm=True,
	        attn=False,
	        heads=4,
	        ndecay=4,
	        lstm=False,
	        gelu=True,
	        kernel=3,
	        dilate=True,
	    ):
	        super().__init__()
	        assert kernel % 2 == 1
	        self.channels = channels
	        self.compress = compress
	        self.depth = abs(depth)
	        # Honour the caller's `dilate` flag instead of overwriting it; a
	        # non-positive `depth` still turns dilation off, as before.
	        dilate = dilate and depth > 0

	        norm_fn: tp.Callable[[int], nn.Module]
	        norm_fn = lambda d: nn.Identity()
	        if norm:
	            norm_fn = lambda d: nn.GroupNorm(1, d)

	        hidden = int(channels / compress)

	        act: tp.Type[nn.Module]
	        act = nn.GELU if gelu else nn.ReLU

	        self.layers = nn.ModuleList([])

	        for d in range(self.depth):
	            dilation = 2**d if dilate else 1
	            # Keeps the output length equal to the input length for an odd
	            # kernel.
	            padding = dilation * (kernel // 2)

	            mods = [
	                nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding),
	                norm_fn(hidden),
	                act(),
	                nn.Conv1d(hidden, 2 * channels, 1),
	                norm_fn(2 * channels),
	                nn.GLU(1),
	                LayerScale(channels, init),
	            ]

	            # Both optional blocks go right after the activation; when both
	            # are enabled the BLSTM ends up before the LocalState.
	            if attn:
	                mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay))
	            if lstm:
	                mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True))

	            self.layers.append(nn.Sequential(*mods))

	    def forward(self, x):
	        """Apply each layer as a residual update: `x = x + layer(x)`."""
	        for layer in self.layers:
	            x = x + layer(x)

	        return x

	class LocalState(nn.Module):
	    """Multi-head attention over the last axis, added back to the input.

	    The attention logits are the scaled dot products of keys and queries,
	    plus optional learned penalties: a cosine term per period
	    (`nfreqs` periods) and a term proportional to the distance between the
	    two positions (`ndecay` slopes). A position never attends to itself.

	    Args:
	        channels: number of input and output channels; must be divisible by
	            `heads`.
	        heads: number of attention heads.
	        nfreqs: number of cosine periods (1..nfreqs); 0 disables the term.
	        ndecay: number of decay slopes (1..ndecay); 0 disables the term.
	    """

	    def __init__(self, channels: int, heads: int = 4, nfreqs: int = 0, ndecay: int = 4):
	        super().__init__()

	        assert channels % heads == 0, (channels, heads)

	        def pointwise(out_channels):
	            # Every projection here is a kernel-size-1 convolution.
	            return nn.Conv1d(channels, out_channels, 1)

	        self.heads = heads
	        self.nfreqs = nfreqs
	        self.ndecay = ndecay
	        self.content = pointwise(channels)
	        self.query = pointwise(channels)
	        self.key = pointwise(channels)

	        if nfreqs:
	            self.query_freqs = pointwise(heads * nfreqs)

	        if ndecay:
	            decay_proj = pointwise(heads * ndecay)
	            self.query_decay = decay_proj
	            # Start with small weights and a bias of -2, i.e. the sigmoid
	            # applied in forward() starts out low.
	            decay_proj.weight.data.mul_(0.01)

	            assert decay_proj.bias is not None

	            decay_proj.bias.data.fill_(-2)

	        self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1)

	    def forward(self, x):
	        """Return `x` plus the projected attention output (same shape as `x`)."""
	        batch, _, steps = x.shape
	        n_heads = self.heads

	        def per_head(t):
	            # Split the channel axis into (heads, channels per head).
	            return t.view(batch, n_heads, -1, steps)

	        positions = torch.arange(steps, device=x.device, dtype=x.dtype)
	        offsets = positions[:, None] - positions[None, :]

	        queries = per_head(self.query(x))
	        keys = per_head(self.key(x))

	        logits = torch.einsum("bhct,bhcs->bhts", keys, queries)
	        logits = logits / keys.shape[2] ** 0.5

	        if self.nfreqs:
	            periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype)
	            freq_kernel = torch.cos(2 * math.pi * offsets / periods.view(-1, 1, 1))
	            freq_q = per_head(self.query_freqs(x)) / self.nfreqs**0.5
	            logits = logits + torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q)

	        if self.ndecay:
	            slopes = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype)
	            decay_q = torch.sigmoid(per_head(self.query_decay(x))) / 2
	            decay_kernel = -slopes.view(-1, 1, 1) * offsets.abs() / self.ndecay**0.5
	            logits = logits + torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q)

	        # Push the diagonal down so a position does not attend to itself.
	        diagonal = torch.eye(steps, device=logits.device, dtype=torch.bool)
	        logits = logits.masked_fill(diagonal, -100)
	        weights = torch.softmax(logits, dim=2)

	        values = per_head(self.content(x))
	        attended = torch.einsum("bhts,bhct->bhcs", weights, values)

	        if self.nfreqs:
	            time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel)
	            attended = torch.cat([attended, time_sig], 2)

	        merged = attended.reshape(batch, -1, steps)
	        return x + self.proj(merged)

	class Demucs(nn.Module):
	    """Convolutional encoder/decoder with skip connections.

	    The encoder is a stack of `depth` strided Conv1d layers and the decoder
	    a mirrored stack of ConvTranspose1d layers. Each decoder layer receives
	    the sum of the previous output and the matching encoder output.

	    Args:
	        sources: collection of sources; `len(sources)` sets the number of
	            output stems.
	        audio_channels: number of channels of the input, and of each stem.
	        channels: output channels of the first encoder layer.
	        growth: factor applied to `channels` after every layer.
	        depth: number of encoder layers (and decoder layers).
	        rewrite: if true, add a kernel-size-1 convolution at the end of each
	            encoder layer and a convolution of size `2 * context + 1` at the
	            start of each decoder layer.
	        lstm_layers: number of layers of the `BLSTM` placed between encoder
	            and decoder; 0 disables it.
	        kernel_size: kernel size of the strided (transposed) convolutions.
	        stride: stride of the strided (transposed) convolutions.
	        context: half-width of the decoder rewrite convolution.
	        gelu: if true use GELU after the strided convolutions, else ReLU.
	        glu: if true use GLU after the rewrite convolutions (which then
	            output twice the channels), else ReLU.
	        norm_starts: index of the first layer that uses GroupNorm.
	        norm_groups: number of groups for GroupNorm.
	        dconv_mode: bit 1 adds a `DConv` to encoder layers, bit 2 to decoder
	            layers.
	        dconv_depth: `depth` argument of `DConv`.
	        dconv_comp: `compress` argument of `DConv`.
	        dconv_attn: index of the first layer whose `DConv` uses attention.
	        dconv_lstm: index of the first layer whose `DConv` uses a BLSTM.
	        dconv_init: `init` argument of `DConv`.
	        normalize: if true, standardize the input in `forward` and undo it
	            on the output.
	        resample: if true, `forward` resamples the input by 2 before the
	            encoder and by 1/2 after the decoder.
	        rescale: reference passed to `rescale_module`; a falsy value skips
	            the rescaling.
	        samplerate: stored as an attribute; not used in this class.
	        segment: stored as an attribute; not used in this class.
	    """

	    @capture_init
	    def __init__(
	        self,
	        sources,
	        audio_channels=2,
	        channels=64,
	        growth=2.0,
	        depth=6,
	        rewrite=True,
	        lstm_layers=0,
	        kernel_size=8,
	        stride=4,
	        context=1,
	        gelu=True,
	        glu=True,
	        norm_starts=4,
	        norm_groups=4,
	        dconv_mode=1,
	        dconv_depth=2,
	        dconv_comp=4,
	        dconv_attn=4,
	        dconv_lstm=4,
	        dconv_init=1e-4,
	        normalize=True,
	        resample=True,
	        rescale=0.1,
	        samplerate=44100,
	        segment=4 * 10,
	    ):
	        super().__init__()
	        self.audio_channels = audio_channels
	        self.sources = sources
	        self.kernel_size = kernel_size
	        self.context = context
	        self.stride = stride
	        self.depth = depth
	        self.resample = resample
	        self.channels = channels
	        self.normalize = normalize
	        self.samplerate = samplerate
	        self.segment = segment

	        self.encoder = nn.ModuleList()
	        self.decoder = nn.ModuleList()
	        self.skip_scales = nn.ModuleList()

	        # GLU halves the channel count, so the convolution feeding it has to
	        # produce twice as many channels.
	        if glu:
	            activation = nn.GLU(dim=1)
	            ch_scale = 2
	        else:
	            activation = nn.ReLU()
	            ch_scale = 1

	        act2 = nn.GELU if gelu else nn.ReLU

	        in_channels = audio_channels
	        padding = 0

	        for index in range(depth):
	            norm_fn = lambda d: nn.Identity()
	            if index >= norm_starts:
	                norm_fn = lambda d: nn.GroupNorm(norm_groups, d)

	            encode = []
	            encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), norm_fn(channels), act2()]

	            attn = index >= dconv_attn
	            lstm = index >= dconv_lstm

	            if dconv_mode & 1:
	                encode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)]
	            if rewrite:
	                encode += [nn.Conv1d(channels, ch_scale * channels, 1), norm_fn(ch_scale * channels), activation]

	            self.encoder.append(nn.Sequential(*encode))

	            decode = []

	            # The outermost decoder layer emits one group of audio channels
	            # per source; inner layers mirror their encoder's input width.
	            out_channels = in_channels if index > 0 else len(self.sources) * audio_channels

	            if rewrite:
	                decode += [nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context), norm_fn(ch_scale * channels), activation]
	            if dconv_mode & 2:
	                decode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)]

	            decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride, padding=padding)]

	            if index > 0:
	                decode += [norm_fn(out_channels), act2()]

	            # Decoder layers are stored innermost first, i.e. in the order
	            # forward() applies them.
	            self.decoder.insert(0, nn.Sequential(*decode))
	            in_channels = channels
	            channels = int(growth * channels)

	        # Width of the innermost encoder output.
	        channels = in_channels

	        self.lstm = BLSTM(channels, lstm_layers) if lstm_layers else None

	        if rescale:
	            rescale_module(self, reference=rescale)

	    def valid_length(self, length):
	        """Return a length, derived from `length`, that the network maps onto itself.

	        The length is pushed through the output-size formula of the `depth`
	        strided convolutions (rounding up, at least 1 per layer) and then
	        back through the transposed convolutions. When `resample` is on, the
	        computation is done on the doubled length and halved (rounding up)
	        at the end.
	        """
	        if self.resample:
	            length *= 2

	        for _ in range(self.depth):
	            length = math.ceil((length - self.kernel_size) / self.stride) + 1
	            length = max(1, length)

	        for _ in range(self.depth):
	            length = (length - 1) * self.stride + self.kernel_size

	        if self.resample:
	            length = math.ceil(length / 2)

	        return int(length)

	    def forward(self, mix):
	        """Separate `mix` into stems.

	        Returns a tensor viewed as
	        `(mix.size(0), len(self.sources), self.audio_channels, length)`, where
	        `length` is the size of the last dimension of `mix`.
	        """
	        x = mix
	        length = x.shape[-1]

	        if self.normalize:
	            # Statistics are taken over the channel-averaged signal.
	            mono = mix.mean(dim=1, keepdim=True)
	            mean = mono.mean(dim=-1, keepdim=True)
	            std = mono.std(dim=-1, keepdim=True)
	            x = (x - mean) / (1e-5 + std)
	        else:
	            mean = 0
	            std = 1

	        # Pad both sides up to a length the network can process.
	        delta = self.valid_length(length) - length
	        x = F.pad(x, (delta // 2, delta - delta // 2))

	        if self.resample:
	            x = julius.resample_frac(x, 1, 2)

	        saved = []

	        for encode in self.encoder:
	            x = encode(x)
	            saved.append(x)

	        if self.lstm is not None:
	            x = self.lstm(x)

	        for decode in self.decoder:
	            skip = saved.pop(-1)
	            skip = center_trim(skip, x)
	            x = decode(x + skip)

	        if self.resample:
	            x = julius.resample_frac(x, 2, 1)

	        x = x * std + mean
	        x = center_trim(x, length)
	        x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))

	        return x

	    def load_state_dict(self, state, strict=True):
	        """Load `state`, renaming keys that use the older layer numbering.

	        For every encoder/decoder layer, a `<a>.<idx>.2.<bias|weight>` entry
	        is moved to `<a>.<idx>.3.<bias|weight>` when the latter is absent.
	        `state` is modified in place.

	        Returns:
	            The result of `nn.Module.load_state_dict`, so callers using
	            `strict=False` can inspect the missing and unexpected keys.
	        """
	        for idx in range(self.depth):
	            for a in ["encoder", "decoder"]:
	                for b in ["bias", "weight"]:
	                    new = f"{a}.{idx}.3.{b}"
	                    old = f"{a}.{idx}.2.{b}"

	                    if old in state and new not in state:
	                        state[new] = state.pop(old)

	        return super().load_state_dict(state, strict=strict)