diff --git a/server/const.py b/server/const.py
index 64127f92..b5d0af63 100644
--- a/server/const.py
+++ b/server/const.py
@@ -65,9 +65,9 @@ class EnumInferenceTypes(Enum):
     pyTorchRVCNono = "pyTorchRVCNono"
     pyTorchRVCv2 = "pyTorchRVCv2"
     pyTorchRVCv2Nono = "pyTorchRVCv2Nono"
-    pyTorchRVCv3 = "pyTorchRVCv3"
     pyTorchWebUI = "pyTorchWebUI"
     pyTorchWebUINono = "pyTorchWebUINono"
+    pyTorchVoRASbeta = "pyTorchVoRASbeta"

     onnxRVC = "onnxRVC"
     onnxRVCNono = "onnxRVCNono"
diff --git a/server/voice_changer/RVC/ModelSlotGenerator.py b/server/voice_changer/RVC/ModelSlotGenerator.py
index b897cf2c..99a8b14f 100644
--- a/server/voice_changer/RVC/ModelSlotGenerator.py
+++ b/server/voice_changer/RVC/ModelSlotGenerator.py
@@ -10,18 +10,15 @@ from data.ModelSlot import ModelSlot
 def _setInfoByPytorch(slot: ModelSlot):
     cpt = torch.load(slot.modelFile, map_location="cpu")
     config_len = len(cpt["config"])
-
-    if cpt["version"] == "v3":
+    print(cpt["version"])
+    if cpt["version"] == "voras_beta":
         slot.f0 = True if cpt["f0"] == 1 else False
-        slot.modelType = EnumInferenceTypes.pyTorchRVCv3.value
-        slot.embChannels = cpt["config"][17]
+        slot.modelType = EnumInferenceTypes.pyTorchVoRASbeta.value
+        slot.embChannels = 768
         slot.embOutputLayer = (
             cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
         )
-        if slot.embChannels == 256:
-            slot.useFinalProj = True
-        else:
-            slot.useFinalProj = False
+        slot.useFinalProj = False

         slot.embedder = cpt["embedder_name"]
         if slot.embedder.endswith("768"):
@@ -33,7 +30,6 @@ def _setInfoByPytorch(slot: ModelSlot):
             slot.embedder = EnumEmbedderTypes.contentvec.value
         elif slot.embedder == EnumEmbedderTypes.hubert_jp.value:
             slot.embedder = EnumEmbedderTypes.hubert_jp.value
-        print("nadare v3 loaded")
         else:
             raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")
diff --git a/server/voice_changer/RVC/inferencer/Inferencer.py b/server/voice_changer/RVC/inferencer/Inferencer.py
index 3e5a62d8..cf5913bf 100644
--- a/server/voice_changer/RVC/inferencer/Inferencer.py
+++ b/server/voice_changer/RVC/inferencer/Inferencer.py
@@ -8,7 +8,7 @@ from const import EnumInferenceTypes

 class Inferencer(Protocol):
     inferencerType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
     file: str
-    isHalf: bool = True
+    isHalf: bool = False
     gpu: int = 0
     model: onnxruntime.InferenceSession | Any | None = None
diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py b/server/voice_changer/RVC/inferencer/InferencerManager.py
index 0bc63bd9..ef4bea5e 100644
--- a/server/voice_changer/RVC/inferencer/InferencerManager.py
+++ b/server/voice_changer/RVC/inferencer/InferencerManager.py
@@ -8,7 +8,7 @@ from voice_changer.RVC.inferencer.RVCInferencerv2 import RVCInferencerv2
 from voice_changer.RVC.inferencer.RVCInferencerv2Nono import RVCInferencerv2Nono
 from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer
 from voice_changer.RVC.inferencer.WebUIInferencerNono import WebUIInferencerNono
-from voice_changer.RVC.inferencer.RVCInferencerv3 import RVCInferencerv3
+from voice_changer.RVC.inferencer.VorasInferencebeta import VoRASInferencer

 class InferencerManager:
     currentInferencer: Inferencer | None = None
@@ -37,8 +37,8 @@ class InferencerManager:
             return RVCInferencerNono().loadModel(file, gpu)
         elif inferencerType == EnumInferenceTypes.pyTorchRVCv2 or inferencerType == EnumInferenceTypes.pyTorchRVCv2.value:
             return RVCInferencerv2().loadModel(file, gpu)
-        elif inferencerType == EnumInferenceTypes.pyTorchRVCv3 or inferencerType == EnumInferenceTypes.pyTorchRVCv3.value:
-
return RVCInferencerv3().loadModel(file, gpu) + elif inferencerType == EnumInferenceTypes.pyTorchVoRASbeta or inferencerType == EnumInferenceTypes.pyTorchVoRASbeta.value: + return VoRASInferencer().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchRVCv2Nono or inferencerType == EnumInferenceTypes.pyTorchRVCv2Nono.value: return RVCInferencerv2Nono().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchWebUI or inferencerType == EnumInferenceTypes.pyTorchWebUI.value: diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv3.py b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py similarity index 68% rename from server/voice_changer/RVC/inferencer/RVCInferencerv3.py rename to server/voice_changer/RVC/inferencer/VorasInferencebeta.py index ed6eb63f..a5b02f40 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv3.py +++ b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py @@ -4,26 +4,25 @@ from torch import device from const import EnumInferenceTypes from voice_changer.RVC.inferencer.Inferencer import Inferencer from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager -from .model_v3.models import SynthesizerTrnMs256NSFSid +from .voras_beta.models import Synthesizer -class RVCInferencerv3(Inferencer): +class VoRASInferencer(Inferencer): def loadModel(self, file: str, gpu: device): - print("nadare v3 load start") - super().setProps(EnumInferenceTypes.pyTorchRVCv3, file, True, gpu) + super().setProps(EnumInferenceTypes.pyTorchVoRASbeta, file, False, gpu) dev = DeviceManager.get_instance().getDevice(gpu) - isHalf = False # DeviceManager.get_instance().halfPrecisionAvailable(gpu) + self.isHalf = False # DeviceManager.get_instance().halfPrecisionAvailable(gpu) cpt = torch.load(file, map_location="cpu") - model = SynthesizerTrnMs256NSFSid(**cpt["params"]) + model = Synthesizer(**cpt["params"]) model.eval() model.load_state_dict(cpt["weight"], strict=False) + model.remove_weight_norm() + model.change_speaker(0) model = model.to(dev) - if isHalf: - model = model.half() self.model = model print("load model comprete") diff --git a/server/voice_changer/RVC/inferencer/model_v3/attentions.py b/server/voice_changer/RVC/inferencer/model_v3/attentions.py deleted file mode 100644 index 0a9ecf37..00000000 --- a/server/voice_changer/RVC/inferencer/model_v3/attentions.py +++ /dev/null @@ -1,343 +0,0 @@ -import math - -import torch -from torch import nn -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, weight_norm - -from . 
import commons -from .modules import LayerNorm, LoRALinear1d - - -class Encoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - gin_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - window_size=25, - **kwargs - ): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - gin_channels, - n_heads, - p_dropout=p_dropout, - window_size=window_size, - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - gin_channels, - kernel_size, - p_dropout=p_dropout, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask, g): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, g, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask, g) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.attn_layers: - l.remove_weight_norm() - for l in self.ffn_layers: - l.remove_weight_norm() - - -class MultiHeadAttention(nn.Module): - def __init__( - self, - channels, - out_channels, - gin_channels, - n_heads, - p_dropout=0.0, - window_size=None, - heads_share=False, - block_length=None, - proximal_bias=False, - proximal_init=False, - ): - super().__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = LoRALinear1d(channels, channels, gin_channels, 2) - self.conv_k = LoRALinear1d(channels, channels, gin_channels, 2) - self.conv_v = LoRALinear1d(channels, channels, gin_channels, 2) - self.conv_qkw = weight_norm(nn.Conv1d(channels, channels, 5, 1, groups=channels, padding=2)) - self.conv_vw = weight_norm(nn.Conv1d(channels, channels, 5, 1, groups=channels, padding=2)) - self.conv_o = LoRALinear1d(channels, out_channels, gin_channels, 2) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - - def forward(self, x, c, g, attn_mask=None): - q = self.conv_qkw(self.conv_q(x, g)) - k = self.conv_qkw(self.conv_k(c, g)) - v = self.conv_vw(self.conv_v(c, g)) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x, g) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = 
(*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - if self.window_size is not None: - assert ( - t_s == t_t - ), "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert ( - t_s == t_t - ), "Local attention is only available for self-attention." - block_mask = ( - torch.ones_like(scores) - .triu(-self.block_length) - .tril(self.block_length) - ) - scores = scores.masked_fill(block_mask == 0, -1e4) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s - ) - output = output + self._matmul_with_relative_values( - relative_weights, value_relative_embeddings - ) - output = ( - output.transpose(2, 3).contiguous().view(b, d, t_t) - ) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - """ - x: [b, h, l, m] - y: [h or 1, m, d] - ret: [b, h, l, d] - """ - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - """ - x: [b, h, l, d] - y: [h or 1, m, d] - ret: [b, h, l, m] - """ - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), - ) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - """ - x: [b, h, l, 2*l-1] - ret: [b, h, l, l] - """ - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) - - # Reshape and slice out the padded elements. 
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] - return x_final - - def _absolute_position_to_relative_position(self, x): - """ - x: [b, h, l, l] - ret: [b, h, l, 2*l-1] - """ - batch, heads, length, _ = x.size() - # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) - x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """Bias for self-attention to encourage attention to close positions. - Args: - length: an integer scalar. - Returns: - a Tensor with shape [1, 1, length, length] - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - def remove_weight_norm(self): - self.conv_q.remove_weight_norm() - self.conv_k.remove_weight_norm() - self.conv_v.remove_weight_norm() - self.conv_o.remove_weight_norm() - remove_weight_norm(self.conv_qkw) - remove_weight_norm(self.conv_vw) - - -class FFN(nn.Module): - def __init__( - self, - in_channels, - out_channels, - filter_channels, - gin_channels, - kernel_size, - p_dropout=0.0, - activation=None, - causal=False, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - - self.conv_1 = LoRALinear1d(in_channels, filter_channels, gin_channels, 2) - self.conv_2 = LoRALinear1d(filter_channels, out_channels, gin_channels, 2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask, g): - x = self.conv_1(x * x_mask, g) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(x * x_mask, g) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - - def remove_weight_norm(self): - self.conv_1.remove_weight_norm() - self.conv_2.remove_weight_norm() diff --git a/server/voice_changer/RVC/inferencer/model_v3/models.py b/server/voice_changer/RVC/inferencer/model_v3/models.py deleted file mode 100644 index 2f8cc064..00000000 --- a/server/voice_changer/RVC/inferencer/model_v3/models.py +++ /dev/null @@ -1,522 +0,0 @@ -import math -import os -import sys - -import numpy as np -import torch -from torch import nn -from torch.nn import Conv1d, Conv2d, ConvTranspose1d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from . 
import attentions, commons, modules -from .commons import get_padding, init_weights -from .modules import (CausalConvTranspose1d, ConvNext2d, DilatedCausalConv1d, - LoRALinear1d, ResBlock1, WaveConv1D) - -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - - -class TextEncoder(nn.Module): - def __init__( - self, - out_channels: int, - hidden_channels: int, - filter_channels: int, - emb_channels: int, - gin_channels: int, - n_heads: int, - n_layers: int, - kernel_size: int, - p_dropout: int, - f0: bool = True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.emb_channels = emb_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(emb_channels, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.emb_g = nn.Conv1d(gin_channels, hidden_channels, 1) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, gin_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - - def forward(self, phone, pitch, lengths, g): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask, g) - x = self.proj(x) - - return x, None, x_mask - - -class SineGen(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) - """ - - def __init__( - self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False, - ): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - # generate uv signal - uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - return uv - - def forward(self, f0, upp): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ - with torch.no_grad(): - f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - # fundamental component - f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( - idx + 2 - ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 - rand_ini = torch.rand( - 
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device - ) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 - tmp_over_one *= upp - tmp_over_one = F.interpolate( - tmp_over_one.transpose(2, 1), - scale_factor=upp, - mode="linear", - align_corners=True, - ).transpose(2, 1) - rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose( - 2, 1 - ) ####### - tmp_over_one %= 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi - ) - sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) - uv = F.interpolate( - uv.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose(2, 1) - noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 - noise = noise_amp * torch.randn_like(sine_waves) - sine_waves = sine_waves * uv + noise - return sine_waves, uv, noise - - -class SourceModuleHnNSF(torch.nn.Module): - """SourceModule for hn-nsf - SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0) - sampling_rate: sampling_rate in Hz - harmonic_num: number of harmonic above F0 (default: 0) - sine_amp: amplitude of sine source signal (default: 0.1) - add_noise_std: std of additive Gaussian noise (default: 0.003) - note that amplitude of noise in unvoiced is decided - by sine_amp - voiced_threshold: threhold to set U/V given F0 (default: 0) - Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) - F0_sampled (batchsize, length, 1) - Sine_source (batchsize, length, 1) - noise_source (batchsize, length 1) - uv (batchsize, length, 1) - """ - - def __init__( - self, - sampling_rate, - gin_channels, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshod=0, - is_half=True, - ): - super(SourceModuleHnNSF, self).__init__() - - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.is_half = is_half - # to produce sine waveforms - self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod - ) - - # to merge source harmonics into a single excitation - self.l_linear = torch.nn.Conv1d(gin_channels, harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upp=None): - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - sine_raw = torch.transpose(sine_wavs, 1, 2).to(device=x.device, dtype=x.dtype) - return sine_raw, None, None # noise, uv - - -class GeneratorNSF(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels, - sr, - harmonic_num=16, - is_half=False, - ): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.upsample_rates = upsample_rates - - self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF( - sampling_rate=sr, gin_channels=gin_channels, harmonic_num=harmonic_num, is_half=is_half - ) - self.gpre = Conv1d(gin_channels, initial_channel, 1) - self.conv_pre = ResBlock1(initial_channel, upsample_initial_channel, gin_channels, [7] * 5, [1] * 5, [1, 2, 4, 8, 1], 1, 2) - - self.ups = nn.ModuleList() - self.resblocks = 
nn.ModuleList() - c_cur = upsample_initial_channel - for i, u in enumerate(upsample_rates): - c_pre = c_cur - c_cur = c_pre // 2 - self.ups.append( - CausalConvTranspose1d( - c_pre, - c_pre, - kernel_rate=3, - stride=u, - groups=c_pre, - ) - ) - self.resblocks.append(ResBlock1(c_pre, c_cur, gin_channels, [11] * 5, [1] * 5, [1, 2, 4, 8, 1], 1, r=2)) - self.conv_post = DilatedCausalConv1d(c_cur, 1, 5, stride=1, groups=1, dilation=1, bias=False) - self.noise_convs = nn.ModuleList() - self.noise_pre = LoRALinear1d(1 + harmonic_num, c_pre, gin_channels, r=2+harmonic_num) - for i, u in enumerate(upsample_rates[::-1]): - c_pre = c_pre * 2 - c_cur = c_cur * 2 - if i + 1 < len(upsample_rates): - self.noise_convs.append(DilatedCausalConv1d(c_cur, c_pre, kernel_size=u*3, stride=u, groups=c_cur, dilation=1)) - else: - self.noise_convs.append(DilatedCausalConv1d(c_cur, initial_channel, kernel_size=u*3, stride=u, groups=math.gcd(c_cur, initial_channel), dilation=1)) - self.upp = np.prod(upsample_rates) - - def forward(self, x, x_mask, f0f, g): - har_source, noi_source, uv = self.m_source(f0f, self.upp) - har_source = self.noise_pre(har_source, g) - x_sources = [har_source] - for c in self.noise_convs: - har_source = c(har_source) - x_sources.append(har_source) - - x = x + x_sources[-1] - x = x + self.gpre(g) - x = self.conv_pre(x, x_mask, g) - for i, u in enumerate(self.upsample_rates): - x_mask = torch.repeat_interleave(x_mask, u, 2) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - x = self.resblocks[i](x + x_sources[-i-2], x_mask, g) - - x = F.leaky_relu(x) - x = self.conv_post(x) - if x_mask is not None: - x *= x_mask - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - remove_weight_norm(self.noise_pre) - remove_weight_norm(self.conv_post) - - -sr2sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, -} - - -class SynthesizerTrnMs256NSFSid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - emb_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.emb_channels = emb_channels - self.sr = sr - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - - self.emb_pitch = nn.Embedding(256, emb_channels) # pitch 256 - self.dec = GeneratorNSF( - emb_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - ) - - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print( - 
"gin_channels:", - gin_channels, - "self.spk_embed_dim:", - self.spk_embed_dim, - "emb_channels:", - emb_channels, - ) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - - def forward( - self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - # m_p, _, x_mask = self.enc_p(phone, pitch, phone_lengths, g) - # z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - # z_p = self.flow(m_p * x_mask, x_mask, g=g) - - x = phone + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to( - phone.dtype - ) - - m_p_slice, ids_slice = commons.rand_slice_segments( - x, phone_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - mask_slice = commons.slice_segments(x_mask, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(m_p_slice, mask_slice, pitchf, g) - return o, ids_slice, x_mask, g - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - x = phone + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) - x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to( - phone.dtype - ) - o = self.dec((x * x_mask)[:, :, :max_len], x_mask, nsff0, g) - return o, x_mask, (None, None, None, None) - - -class DiscriminatorS(torch.nn.Module): - def __init__( - self, - hidden_channels: int, - filter_channels: int, - gin_channels: int, - n_heads: int, - n_layers: int, - kernel_size: int, - p_dropout: int, - ): - super(DiscriminatorS, self).__init__() - self.convs = WaveConv1D(2, hidden_channels, gin_channels, [10, 7, 7, 7, 5, 3, 3], [5, 4, 4, 4, 3, 2, 2], [1] * 7, hidden_channels // 2, False) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, gin_channels, n_heads, n_layers//2, kernel_size, p_dropout - ) - self.cross = weight_norm(torch.nn.Conv1d(gin_channels, hidden_channels, 1, 1)) - self.conv_post = weight_norm(torch.nn.Conv1d(hidden_channels, 1, 3, 1, padding=get_padding(5, 1))) - - def forward(self, x, g): - x = self.convs(x) - x_mask = torch.ones([x.shape[0], 1, x.shape[2]], device=x.device, dtype=x.dtype) - x = self.encoder(x, x_mask, g) - fmap = [x] - x = x + x * self.cross(g) - y = self.conv_post(x) - return y, fmap - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, gin_channels, upsample_rates, final_dim=256, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - self.init_kernel_size = upsample_rates[-1] * 3 - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - N = len(upsample_rates) - self.init_conv = norm_f(Conv2d(1, final_dim // (2 ** (N - 1)), (self.init_kernel_size, 1), (upsample_rates[-1], 1))) - self.convs = nn.ModuleList() - for i, u in enumerate(upsample_rates[::-1][1:], start=1): - self.convs.append( - ConvNext2d( - final_dim // (2 ** (N - i)), - final_dim // (2 ** (N - i - 1)), - gin_channels, - (u*3, 1), - (u, 1), - 4, - r=2 - ) - ) - self.conv_post = weight_norm(Conv2d(final_dim, 1, (3, 1), (1, 1))) - - def forward(self, x, g): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - 
x = F.pad(x, (n_pad, 0), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - x = torch.flip(x, dims=[2]) - x = F.pad(x, [0, 0, 0, self.init_kernel_size - 1], mode="constant") - x = self.init_conv(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = torch.flip(x, dims=[2]) - - for i, l in enumerate(self.convs): - x = l(x, g) - if i >= 1: - fmap.append(x) - - x = F.pad(x, [0, 0, 2, 0], mode="constant") - x = self.conv_post(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, upsample_rates, gin_channels, periods=[2, 3, 5, 7, 11, 17], **kwargs): - super(MultiPeriodDiscriminator, self).__init__() - - # discs = [DiscriminatorS(hidden_channels, filter_channels, gin_channels, n_heads, n_layers, kernel_size, p_dropout)] - discs = [ - DiscriminatorP(i, gin_channels, upsample_rates, use_spectral_norm=False) for i in periods - ] - self.ups = np.prod(upsample_rates) - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat, g): - fmap_rs = [] - fmap_gs = [] - y_d_rs = [] - y_d_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y, g) - y_d_g, fmap_g = d(y_hat, g) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/server/voice_changer/RVC/inferencer/model_v3/modules.py b/server/voice_changer/RVC/inferencer/model_v3/modules.py deleted file mode 100644 index fb0db86d..00000000 --- a/server/voice_changer/RVC/inferencer/model_v3/modules.py +++ /dev/null @@ -1,626 +0,0 @@ -import math - -import torch -from torch import nn -from torch.nn import Conv1d, Conv2d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from . import commons, modules -from .commons import get_padding, init_weights -from .transforms import piecewise_rational_quadratic_transform - -LRELU_SLOPE = 0.1 - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class ConvReluNorm(nn.Module): - def __init__( - self, - in_channels, - hidden_channels, - out_channels, - kernel_size, - n_layers, - p_dropout, - ): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." 
- - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) - for _ in range(n_layers - 1): - self.conv_layers.append( - nn.Conv1d( - hidden_channels, - hidden_channels, - kernel_size, - padding=kernel_size // 2, - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask - - -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size**i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append( - nn.Conv1d( - channels, - channels, - kernel_size, - groups=channels, - dilation=dilation, - padding=padding, - ) - ) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - -class WN(torch.nn.Module): - def __init__( - self, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - p_dropout=0, - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - self.hidden_channels = hidden_channels - self.kernel_size = (kernel_size,) - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - - for i in range(n_layers): - dilation = dilation_rate**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - hidden_channels, - 2 * hidden_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - 
n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - else: - g_l = torch.zeros_like(x_in) - - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, : self.hidden_channels, :] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) - - -class DilatedCausalConv1d(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1, dilation=1, bias=True): - super(DilatedCausalConv1d, self).__init__() - self.kernel_size = kernel_size - self.dilation = dilation - self.stride = stride - self.conv = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups, dilation=dilation, bias=bias)) - init_weights(self.conv) - - def forward(self, x): - x = torch.flip(x, [2]) - x = F.pad(x, [0, (self.kernel_size - 1) * self.dilation], mode="constant", value=0.) - size = x.shape[2] // self.stride - x = self.conv(x)[:, :, :size] - x = torch.flip(x, [2]) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.conv) - - -class CausalConvTranspose1d(nn.Module): - """ - padding = 0, dilation = 1のとき - - Lout = (Lin - 1) * stride + kernel_rate * stride + output_padding - Lout = Lin * stride + (kernel_rate - 1) * stride + output_padding - output_paddingいらないね - """ - def __init__(self, in_channels, out_channels, kernel_rate=3, stride=1, groups=1): - super(CausalConvTranspose1d, self).__init__() - kernel_size = kernel_rate * stride - self.trim_size = (kernel_rate - 1) * stride - self.conv = weight_norm(nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups)) - - def forward(self, x): - x = self.conv(x) - return x[:, :, :-self.trim_size] - - def remove_weight_norm(self): - remove_weight_norm(self.conv) - - -class LoRALinear1d(nn.Module): - def __init__(self, in_channels, out_channels, info_channels, r): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.info_channels = info_channels - self.r = r - self.main_fc = weight_norm(nn.Conv1d(in_channels, out_channels, 1)) - self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) - self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) - nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) - nn.init.constant_(self.adapter_out.weight.data, 1e-6) - init_weights(self.main_fc) - self.adapter_in = weight_norm(self.adapter_in) - self.adapter_out = weight_norm(self.adapter_out) - - def forward(self, x, g): - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - x = self.main_fc(x) + torch.einsum("brl,brc->bcl", torch.einsum("bcl,bcr->brl", x, a_in), a_out) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.main_fc) - remove_weight_norm(self.adapter_in) - 
remove_weight_norm(self.adapter_out) - - -class LoRALinear2d(nn.Module): - def __init__(self, in_channels, out_channels, info_channels, r): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.info_channels = info_channels - self.r = r - self.main_fc = weight_norm(nn.Conv2d(in_channels, out_channels, (1, 1), (1, 1))) - self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) - self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) - nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) - nn.init.constant_(self.adapter_out.weight.data, 1e-6) - self.adapter_in = weight_norm(self.adapter_in) - self.adapter_out = weight_norm(self.adapter_out) - - def forward(self, x, g): - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - x = self.main_fc(x) + torch.einsum("brhw,brc->bchw", torch.einsum("bchw,bcr->brhw", x, a_in), a_out) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.main_fc) - remove_weight_norm(self.adapter_in) - remove_weight_norm(self.adapter_out) - - -class WaveConv1D(torch.nn.Module): - def __init__(self, in_channels, out_channels, gin_channels, kernel_sizes, strides, dilations, extend_ratio, r, use_spectral_norm=False): - super(WaveConv1D, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.convs = [] - # self.norms = [] - self.convs.append(LoRALinear1d(in_channels, inner_channels, gin_channels, r)) - for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations), start=1): - self.convs.append(norm_f(Conv1d(inner_channels, inner_channels, k, s, dilation=d, groups=inner_channels, padding=get_padding(k, d)))) - if i < len(kernel_sizes): - self.convs.append(norm_f(Conv1d(inner_channels, inner_channels, 1, 1))) - else: - self.convs.append(norm_f(Conv1d(inner_channels, out_channels, 1, 1))) - self.convs = nn.ModuleList(self.convs) - - def forward(self, x, g, x_mask=None): - for i, l in enumerate(self.convs): - if i % 2: - x_ = l(x) - else: - x_ = l(x, g) - x = F.leaky_relu(x_, modules.LRELU_SLOPE) - if x_mask is not None: - x *= x_mask - return x - - def remove_weight_norm(self): - for i, c in enumerate(self.convs): - if i % 2: - remove_weight_norm(c) - else: - c.remove_weight_norm() - - -class MBConv2d(torch.nn.Module): - """ - Causal MBConv2D - """ - def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): - super(MBConv2d, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.kernel_size = kernel_size - self.pre_pointwise = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) - self.depthwise = norm_f(Conv2d(inner_channels, inner_channels, kernel_size, stride, groups=inner_channels)) - self.post_pointwise = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) - - def forward(self, x, g): - x = self.pre_pointwise(x, g) - x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") - x = self.depthwise(x) - x = self.post_pointwise(x, g) - return x - - -class ConvNext2d(torch.nn.Module): - """ - Causal ConvNext Block - stride = 1 only - """ - def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): - super(ConvNext2d, self).__init__() - norm_f = weight_norm if use_spectral_norm == False 
else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.kernel_size = kernel_size - self.dwconv = norm_f(Conv2d(in_channels, in_channels, kernel_size, stride, groups=in_channels)) - self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) - self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) - self.act = nn.GELU() - self.norm = LayerNorm(in_channels) - - def forward(self, x, g): - x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") - x = self.dwconv(x) - x = self.norm(x) - x = self.pwconv1(x, g) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.pwconv2(x, g) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.dwconv) - - -class SqueezeExcitation1D(torch.nn.Module): - def __init__(self, input_channels, squeeze_channels, gin_channels, use_spectral_norm=False): - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - super(SqueezeExcitation1D, self).__init__() - self.fc1 = LoRALinear1d(input_channels, squeeze_channels, gin_channels, 2) - self.fc2 = LoRALinear1d(squeeze_channels, input_channels, gin_channels, 2) - - def _scale(self, x, x_mask, g): - x_length = torch.sum(x_mask, dim=2, keepdim=True) - x_length = torch.maximum(x_length, torch.ones_like(x_length)) - scale = torch.sum(x * x_mask, dim=2, keepdim=True) / x_length - scale = self.fc1(scale, g) - scale = F.leaky_relu(scale, modules.LRELU_SLOPE) - scale = self.fc2(scale, g) - return torch.sigmoid(scale) - - def forward(self, x, x_mask, g): - scale = self._scale(x, x_mask, g) - return scale * x - - def remove_weight_norm(self): - self.fc1.remove_weight_norm() - self.fc2.remove_weight_norm() - - -class ResBlock1(torch.nn.Module): - def __init__(self, in_channels, out_channels, gin_channels, kernel_sizes, strides, dilations, extend_ratio, r): - super(ResBlock1, self).__init__() - norm_f = weight_norm - inner_channels = int(in_channels * extend_ratio) - self.dconvs = nn.ModuleList() - self.pconvs = nn.ModuleList() - # self.ses = nn.ModuleList() - self.norms = nn.ModuleList() - self.init_conv = LoRALinear1d(in_channels, inner_channels, gin_channels, r) - for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations)): - self.norms.append(LayerNorm(inner_channels)) - self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, k, stride=s, dilation=d, groups=inner_channels)) - if i < len(kernel_sizes) - 1: - self.pconvs.append(LoRALinear1d(inner_channels, inner_channels, gin_channels, r)) - self.out_conv = LoRALinear1d(inner_channels, out_channels, gin_channels, r) - init_weights(self.init_conv) - init_weights(self.out_conv) - - def forward(self, x, x_mask, g): - x *= x_mask - x = self.init_conv(x, g) - for i in range(len(self.dconvs)): - x *= x_mask - x = self.norms[i](x) - x_ = self.dconvs[i](x) - x_ = F.leaky_relu(x_, modules.LRELU_SLOPE) - if i < len(self.dconvs) - 1: - x = x + self.pconvs[i](x_, g) - x = self.out_conv(x_, g) - return x - - def remove_weight_norm(self): - for c in self.dconvs: - c.remove_weight_norm() - for c in self.pconvs: - c.remove_weight_norm() - self.init_conv.remove_weight_norm() - self.out_conv.remove_weight_norm() - - -class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, 
**kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - -class ResidualCouplingLayer(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=p_dropout, - gin_channels=gin_channels, - ) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class ConvFlow(nn.Module): - def __init__( - self, - in_channels, - filter_channels, - kernel_size, - n_layers, - num_bins=10, - tail_bound=5.0, - ): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) - self.proj = nn.Conv1d( - filter_channels, self.half_channels * (num_bins * 3 - 1), 1 - ) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
- - unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) - unnormalized_derivatives = h[..., 2 * self.num_bins :] - - x1, logabsdet = piecewise_rational_quadratic_transform( - x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails="linear", - tail_bound=self.tail_bound, - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1, 2]) - if not reverse: - return x, logdet - else: - return x diff --git a/server/voice_changer/RVC/inferencer/model_v3/commons.py b/server/voice_changer/RVC/inferencer/voras_beta/commons.py similarity index 100% rename from server/voice_changer/RVC/inferencer/model_v3/commons.py rename to server/voice_changer/RVC/inferencer/voras_beta/commons.py diff --git a/server/voice_changer/RVC/inferencer/model_v3/config.py b/server/voice_changer/RVC/inferencer/voras_beta/config.py similarity index 77% rename from server/voice_changer/RVC/inferencer/model_v3/config.py rename to server/voice_changer/RVC/inferencer/voras_beta/config.py index 6a0dc817..ddfb4271 100644 --- a/server/voice_changer/RVC/inferencer/model_v3/config.py +++ b/server/voice_changer/RVC/inferencer/voras_beta/config.py @@ -32,27 +32,17 @@ class TrainConfigData(BaseModel): class TrainConfigModel(BaseModel): + emb_channels: int inter_channels: int - hidden_channels: int - filter_channels: int - n_heads: int n_layers: int - kernel_size: int - p_dropout: int - resblock: str - resblock_kernel_sizes: List[int] - resblock_dilation_sizes: List[List[int]] upsample_rates: List[int] - upsample_initial_channel: int - upsample_kernel_sizes: List[int] use_spectral_norm: bool gin_channels: int - emb_channels: int spk_embed_dim: int class TrainConfig(BaseModel): - version: Literal["v1", "v2"] = "v2" + version: Literal["voras"] = "voras" train: TrainConfigTrain data: TrainConfigData model: TrainConfigModel diff --git a/server/voice_changer/RVC/inferencer/voras_beta/models.py b/server/voice_changer/RVC/inferencer/voras_beta/models.py new file mode 100644 index 00000000..3168e590 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/voras_beta/models.py @@ -0,0 +1,238 @@ +import math +import os +import sys + +import numpy as np +import torch +from torch import nn +from torch.nn import Conv2d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from . 
import commons, modules +from .commons import get_padding +from .modules import (ConvNext2d, HarmonicEmbedder, IMDCTSymExpHead, + LoRALinear1d, SnakeFilter, WaveBlock) + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +sr2sr = { + "24k": 24000, + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + +class GeneratorVoras(torch.nn.Module): + def __init__( + self, + emb_channels, + inter_channels, + gin_channels, + n_layers, + sr, + hop_length, + ): + super(GeneratorVoras, self).__init__() + self.n_layers = n_layers + self.emb_pitch = HarmonicEmbedder(768, inter_channels, gin_channels, 16, 15) # # pitch 256 + self.plinear = LoRALinear1d(inter_channels, inter_channels, gin_channels, r=8) + self.glinear = weight_norm(nn.Conv1d(gin_channels, inter_channels, 1)) + self.resblocks = nn.ModuleList() + self.init_linear = LoRALinear1d(emb_channels, inter_channels, gin_channels, r=4) + for _ in range(self.n_layers): + self.resblocks.append(WaveBlock(inter_channels, gin_channels, [9] * 2, [1] * 2, [1, 9], 2, r=4)) + self.head = IMDCTSymExpHead(inter_channels, gin_channels, hop_length, padding="center", sample_rate=sr) + self.post = SnakeFilter(4, 8, 9, 2, eps=1e-5) + + def forward(self, x, pitchf, x_mask, g): + x = self.init_linear(x, g) + self.plinear(self.emb_pitch(pitchf, g), g) + self.glinear(g) + for i in range(self.n_layers): + x = self.resblocks[i](x, x_mask, g) + x = x * x_mask + x = self.head(x, g) + x = self.post(x) + return torch.tanh(x) + + def remove_weight_norm(self): + self.plinear.remove_weight_norm() + remove_weight_norm(self.glinear) + for l in self.resblocks: + l.remove_weight_norm() + self.init_linear.remove_weight_norm() + self.head.remove_weight_norm() + self.post.remove_weight_norm() + + def fix_speaker(self, g): + self.plinear.fix_speaker(g) + self.init_linear.fix_speaker(g) + for l in self.resblocks: + l.fix_speaker(g) + self.head.fix_speaker(g) + + def unfix_speaker(self, g): + self.plinear.unfix_speaker(g) + self.init_linear.unfix_speaker(g) + for l in self.resblocks: + l.unfix_speaker(g) + self.head.unfix_speaker(g) + + +class Synthesizer(nn.Module): + def __init__( + self, + segment_size, + n_fft, + hop_length, + inter_channels, + n_layers, + spk_embed_dim, + gin_channels, + emb_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.segment_size = segment_size + self.n_fft = n_fft + self.hop_length = hop_length + self.inter_channels = inter_channels + self.n_layers = n_layers + self.spk_embed_dim = spk_embed_dim + self.gin_channels = gin_channels + self.emb_channels = emb_channels + self.sr = sr + + self.dec = GeneratorVoras( + emb_channels, + inter_channels, + gin_channels, + n_layers, + sr, + hop_length + ) + + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print( + "gin_channels:", + gin_channels, + "self.spk_embed_dim:", + self.spk_embed_dim, + "emb_channels:", + emb_channels, + ) + self.speaker = None + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + + def change_speaker(self, sid: int): + if self.speaker is not None: + g = self.emb_g(torch.from_numpy(np.array(self.speaker))).unsqueeze(-1) + self.dec.unfix_speaker(g) + g = self.emb_g(torch.from_numpy(np.array(sid))).unsqueeze(-1) + self.dec.fix_speaker(g) + self.speaker = sid + + def forward( + self, phone, phone_lengths, pitch, pitchf, ds + ): + g = self.emb_g(ds).unsqueeze(-1) + x = torch.transpose(phone, 1, -1) + x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, 
x.size(2)), 1).to(phone.dtype) + x_slice, ids_slice = commons.rand_slice_segments( + x, phone_lengths, self.segment_size + ) + pitchf_slice = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + mask_slice = commons.slice_segments(x_mask, ids_slice, self.segment_size) + o = self.dec(x_slice, pitchf_slice, mask_slice, g) + return o, ids_slice, x_mask, g + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + x = torch.transpose(phone, 1, -1) + x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(phone.dtype) + o = self.dec((x * x_mask)[:, :, :max_len], nsff0, x_mask, g) + return o, x_mask, (None, None, None, None) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, gin_channels, upsample_rates, final_dim=256, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + self.init_kernel_size = upsample_rates[-1] * 3 + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + N = len(upsample_rates) + self.init_conv = norm_f(Conv2d(1, final_dim // (2 ** (N - 1)), (self.init_kernel_size, 1), (upsample_rates[-1], 1))) + self.convs = nn.ModuleList() + for i, u in enumerate(upsample_rates[::-1][1:], start=1): + self.convs.append( + ConvNext2d( + final_dim // (2 ** (N - i)), + final_dim // (2 ** (N - i - 1)), + gin_channels, + (u*3, 1), + (u, 1), + 4, + r=2 + i//2 + ) + ) + self.conv_post = weight_norm(Conv2d(final_dim, 1, (3, 1), (1, 1))) + + def forward(self, x, g): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (n_pad, 0), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + x = torch.flip(x, dims=[2]) + x = F.pad(x, [0, 0, 0, self.init_kernel_size - 1], mode="constant") + x = self.init_conv(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = torch.flip(x, dims=[2]) + fmap.append(x) + + for i, l in enumerate(self.convs): + x = l(x, g) + fmap.append(x) + + x = F.pad(x, [0, 0, 2, 0], mode="constant") + x = self.conv_post(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, upsample_rates, gin_channels, periods=[2, 3, 5, 7, 11, 17], **kwargs): + super(MultiPeriodDiscriminator, self).__init__() + + discs = [ + DiscriminatorP(i, gin_channels, upsample_rates, use_spectral_norm=False) for i in periods + ] + self.ups = np.prod(upsample_rates) + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat, g): + fmap_rs = [] + fmap_gs = [] + y_d_rs = [] + y_d_gs = [] + for d in self.discriminators: + y_d_r, fmap_r = d(y, g) + y_d_g, fmap_g = d(y_hat, g) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/server/voice_changer/RVC/inferencer/voras_beta/modules.py b/server/voice_changer/RVC/inferencer/voras_beta/modules.py new file mode 100644 index 00000000..f6659c68 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/voras_beta/modules.py @@ -0,0 +1,496 @@ +import math + +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import Conv1d, Conv2d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm +from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz + +from . 
import commons, modules +from .commons import get_padding, init_weights +from .transforms import piecewise_rational_quadratic_transform + +LRELU_SLOPE = 0.1 + +class HarmonicEmbedder(nn.Module): + def __init__(self, num_embeddings, embedding_dim, gin_channels, num_head, num_harmonic=0, f0_min=50., f0_max=1100., device="cuda"): + super(HarmonicEmbedder, self).__init__() + self.embedding_dim = embedding_dim + self.num_head = num_head + self.num_harmonic = num_harmonic + + f0_mel_min = np.log(1 + f0_min / 700) + f0_mel_max = np.log(1 + f0_max * (1 + num_harmonic) / 700) + self.sequence = torch.from_numpy(np.linspace(f0_mel_min, f0_mel_max, num_embeddings-2)) + self.emb_layer = torch.nn.Embedding(num_embeddings, embedding_dim) + self.linear_q = Conv1d(gin_channels, num_head * (1 + num_harmonic), 1) + self.weight = None + + def forward(self, x, g): + b, l = x.size() + non_zero = (x != 0.).to(dtype=torch.long).unsqueeze(1) + mel = torch.log(1 + x / 700).unsqueeze(1) + harmonies = torch.arange(1 + self.num_harmonic, device=x.device, dtype=x.dtype).view(1, 1 + self.num_harmonic, 1) + 1. + ix = torch.searchsorted(self.sequence.to(x.device), mel * harmonies).to(x.device) + 1 + ix = ix * non_zero + emb = self.emb_layer(ix).transpose(1, 3).reshape(b, self.num_head, self.embedding_dim // self.num_head, 1 + self.num_harmonic, l) + if self.weight is None: + weight = torch.nn.functional.softmax(self.linear_q(g).reshape(b, self.num_head, 1, 1 + self.num_harmonic, 1), 3) + else: + weight = self.weight + res = torch.sum(emb * weight, dim=3).reshape(b, self.embedding_dim, l) + return res + + def fix_speaker(self, g): + self.weight = torch.nn.functional.softmax(self.linear_q(g).reshape(1, self.num_head, 1, 1 + self.num_harmonic, 1), 3) + + def unfix_speaker(self, g): + self.weight = None + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class DilatedCausalConv1d(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1, dilation=1, bias=True): + super(DilatedCausalConv1d, self).__init__() + self.kernel_size = kernel_size + self.dilation = dilation + self.stride = stride + self.conv = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups, dilation=dilation, bias=bias)) + + def forward(self, x): + x = torch.flip(x, [2]) + x = F.pad(x, [0, (self.kernel_size - 1) * self.dilation], mode="constant", value=0.) 
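+        # the flip / right-pad / trim / flip-back sequence below is equivalent to left-padding, i.e. a causal convolution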
+        size = x.shape[2] // self.stride
+        x = self.conv(x)[:, :, :size]
+        x = torch.flip(x, [2])
+        return x
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.conv)
+
+
+class CausalConvTranspose1d(nn.Module):
+    """
+    With padding = 0 and dilation = 1:
+
+    Lout = (Lin - 1) * stride + kernel_rate * stride + output_padding
+    Lout = Lin * stride + (kernel_rate - 1) * stride + output_padding
+    so output_padding is not needed.
+    """
+    def __init__(self, in_channels, out_channels, kernel_rate=3, stride=1, groups=1):
+        super(CausalConvTranspose1d, self).__init__()
+        kernel_size = kernel_rate * stride
+        self.trim_size = (kernel_rate - 1) * stride
+        self.conv = weight_norm(nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups))
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x[:, :, :-self.trim_size]
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.conv)
+
+
+class LoRALinear1d(nn.Module):
+    def __init__(self, in_channels, out_channels, info_channels, r):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.info_channels = info_channels
+        self.r = r
+        self.main_fc = weight_norm(nn.Conv1d(in_channels, out_channels, 1))
+        self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1)
+        self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1)
+        nn.init.normal_(self.adapter_in.weight.data, 0, 0.01)
+        nn.init.constant_(self.adapter_out.weight.data, 1e-6)
+        self.adapter_in = weight_norm(self.adapter_in)
+        self.adapter_out = weight_norm(self.adapter_out)
+        self.speaker_fixed = False
+
+    def forward(self, x, g):
+        x_ = self.main_fc(x)
+        if not self.speaker_fixed:
+            a_in = self.adapter_in(g).view(-1, self.in_channels, self.r)
+            a_out = self.adapter_out(g).view(-1, self.r, self.out_channels)
+            l = torch.einsum("brl,brc->bcl", torch.einsum("bcl,bcr->brl", x, a_in), a_out)
+            x_ = x_ + l
+        return x_
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.main_fc)
+        remove_weight_norm(self.adapter_in)
+        remove_weight_norm(self.adapter_out)
+
+    def fix_speaker(self, g):
+        self.speaker_fixed = True
+        a_in = self.adapter_in(g).view(-1, self.in_channels, self.r)
+        a_out = self.adapter_out(g).view(-1, self.r, self.out_channels)
+        weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2)
+        self.main_fc.weight.data.add_(weight)
+
+    def unfix_speaker(self, g):
+        self.speaker_fixed = False
+        a_in = self.adapter_in(g).view(-1, self.in_channels, self.r)
+        a_out = self.adapter_out(g).view(-1, self.r, self.out_channels)
+        weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2)
+        self.main_fc.weight.data.sub_(weight)
+
+
+class LoRALinear2d(nn.Module):
+    def __init__(self, in_channels, out_channels, info_channels, r):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.info_channels = info_channels
+        self.r = r
+        self.main_fc = weight_norm(nn.Conv2d(in_channels, out_channels, (1, 1), (1, 1)))
+        self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1)
+        self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1)
+        nn.init.normal_(self.adapter_in.weight.data, 0, 0.01)
+        nn.init.constant_(self.adapter_out.weight.data, 1e-6)
+        self.adapter_in = weight_norm(self.adapter_in)
+        self.adapter_out = weight_norm(self.adapter_out)
+        self.speaker_fixed = False
+
+    def forward(self, x, g):
+        x_ = self.main_fc(x)
+        if not self.speaker_fixed:
+            a_in = self.adapter_in(g).view(-1, self.in_channels, self.r)
+            a_out = self.adapter_out(g).view(-1, self.r, self.out_channels)
+            l = 
torch.einsum("brhw,brc->bchw", torch.einsum("bchw,bcr->brhw", x, a_in), a_out) + x_ = x_ + l + return x_ + + def remove_weight_norm(self): + remove_weight_norm(self.main_fc) + remove_weight_norm(self.adapter_in) + remove_weight_norm(self.adapter_out) + + def fix_speaker(self, g): + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2).unsqueeze(3) + self.main_fc.weight.data.add_(weight) + + def unfix_speaker(self, g): + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2).unsqueeze(3) + self.main_fc.weight.data.sub_(weight) + + +class MBConv2d(torch.nn.Module): + """ + Causal MBConv2D + """ + def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): + super(MBConv2d, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.kernel_size = kernel_size + self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) + self.dwconv = norm_f(Conv2d(inner_channels, inner_channels, kernel_size, stride, groups=inner_channels)) + self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) + self.pwnorm = LayerNorm(in_channels) + self.dwnorm = LayerNorm(inner_channels) + + def forward(self, x, g): + x = self.pwnorm(x) + x = self.pwconv1(x, g) + x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") + x = self.dwnorm(x) + x = self.dwconv(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.pwconv2(x, g) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + return x + +class ConvNext2d(torch.nn.Module): + """ + Causal ConvNext Block + stride = 1 only + """ + def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): + super(ConvNext2d, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.kernel_size = kernel_size + self.dwconv = norm_f(Conv2d(in_channels, in_channels, kernel_size, stride, groups=in_channels)) + self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) + self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) + self.act = nn.GELU() + self.norm = LayerNorm(in_channels) + + def forward(self, x, g): + x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") + x = self.dwconv(x) + x = self.norm(x) + x = self.pwconv1(x, g) + x = self.act(x) + x = self.pwconv2(x, g) + x = self.act(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.dwconv) + + +class WaveBlock(torch.nn.Module): + def __init__(self, inner_channels, gin_channels, kernel_sizes, strides, dilations, extend_rate, r): + super(WaveBlock, self).__init__() + norm_f = weight_norm + extend_channels = int(inner_channels * extend_rate) + self.dconvs = nn.ModuleList() + self.p1convs = nn.ModuleList() + self.p2convs = nn.ModuleList() + self.norms = nn.ModuleList() + self.act = nn.GELU() + + # self.ses = nn.ModuleList() + # self.norms = [] + for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations)): + self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, k, stride=s, dilation=d, groups=inner_channels)) + 
self.p1convs.append(LoRALinear1d(inner_channels, extend_channels, gin_channels, r)) + self.p2convs.append(LoRALinear1d(extend_channels, inner_channels, gin_channels, r)) + self.norms.append(LayerNorm(inner_channels)) + + def forward(self, x, x_mask, g): + x *= x_mask + for i in range(len(self.dconvs)): + residual = x.clone() + x = self.dconvs[i](x) + x = self.norms[i](x) + x *= x_mask + x = self.p1convs[i](x, g) + x = self.act(x) + x = self.p2convs[i](x, g) + x = residual + x + return x + + def remove_weight_norm(self): + for c in self.dconvs: + c.remove_weight_norm() + for c in self.p1convs: + c.remove_weight_norm() + for c in self.p2convs: + c.remove_weight_norm() + + def fix_speaker(self, g): + for c in self.p1convs: + c.fix_speaker(g) + for c in self.p2convs: + c.fix_speaker(g) + + def unfix_speaker(self, g): + for c in self.p1convs: + c.unfix_speaker(g) + for c in self.p2convs: + c.unfix_speaker(g) + + +class SnakeFilter(torch.nn.Module): + """ + Adaptive filter using snakebeta + """ + def __init__(self, channels, groups, kernel_size, num_layers, eps=1e-6): + super(SnakeFilter, self).__init__() + self.eps = eps + self.num_layers = num_layers + inner_channels = channels * groups + self.init_conv = DilatedCausalConv1d(1, inner_channels, kernel_size) + self.dconvs = torch.nn.ModuleList() + self.pconvs = torch.nn.ModuleList() + self.post_conv = DilatedCausalConv1d(inner_channels+1, 1, kernel_size, bias=False) + + for i in range(self.num_layers): + self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, kernel_size, stride=1, groups=inner_channels, dilation=kernel_size ** (i + 1))) + self.pconvs.append(weight_norm(Conv1d(inner_channels, inner_channels, 1, groups=groups))) + self.snake_alpha = torch.nn.Parameter(torch.zeros(inner_channels), requires_grad=True) + self.snake_beta = torch.nn.Parameter(torch.zeros(inner_channels), requires_grad=True) + + def forward(self, x): + y = x.clone() + x = self.init_conv(x) + for i in range(self.num_layers): + # snake activation + x = self.dconvs[i](x) + x = self.pconvs[i](x) + x = x + (1.0 / torch.clip(self.snake_beta.unsqueeze(0).unsqueeze(-1), min=self.eps)) * torch.pow(torch.sin(x * self.snake_alpha.unsqueeze(0).unsqueeze(-1)), 2) + x = torch.cat([x, y], 1) + x = self.post_conv(x) + return x + + def remove_weight_norm(self): + self.init_conv.remove_weight_norm() + for c in self.dconvs: + c.remove_weight_norm() + for c in self.pconvs: + remove_weight_norm(c) + self.post_conv.remove_weight_norm() + +""" +https://github.com/charactr-platform/vocos/blob/main/vocos/heads.py +""" +class FourierHead(nn.Module): + """Base class for inverse fourier modules.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, + L is the sequence length, and H denotes the model dimension. + + Returns: + Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. + """ + raise NotImplementedError("Subclasses must implement the forward method.") + + +class IMDCT(nn.Module): + """ + Inverse Modified Discrete Cosine Transform (IMDCT) module. + + Args: + frame_len (int): Length of the MDCT frame. + padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 
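+
+    Note:
+        Each synthesized frame spans 2 * frame_len samples; frames are overlap-added with a hop of frame_len samples (50% overlap).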
+ """ + + def __init__(self, frame_len: int, padding: str = "same"): + super().__init__() + if padding not in ["center", "same"]: + raise ValueError("Padding must be 'center' or 'same'.") + self.padding = padding + self.frame_len = frame_len * 2 + N = frame_len + n0 = (N + 1) / 2 + window = torch.from_numpy(scipy.signal.cosine(N * 2)).float() + self.register_buffer("window", window) + + pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N) + post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2)) + self.register_buffer("pre_twiddle", torch.view_as_real(pre_twiddle)) + self.register_buffer("post_twiddle", torch.view_as_real(post_twiddle)) + + def forward(self, X: torch.Tensor) -> torch.Tensor: + """ + Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients. + + Args: + X (Tensor): Input MDCT coefficients of shape (B, N, L), where B is the batch size, + L is the number of frames, and N is the number of frequency bins. + + Returns: + Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio. + """ + X = X.transpose(1, 2) + B, L, N = X.shape + Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device) + Y[..., :N] = X + Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,))) + y = torch.fft.ifft(Y * torch.view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1) + y = torch.real(y * torch.view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2) + result = y * self.window.expand(y.shape) + output_size = (1, (L + 1) * N) + audio = torch.nn.functional.fold( + result.transpose(1, 2), + output_size=output_size, + kernel_size=(1, self.frame_len), + stride=(1, self.frame_len // 2), + )[:, 0, 0, :] + + if self.padding == "center": + pad = self.frame_len // 2 + elif self.padding == "same": + pad = self.frame_len // 4 + else: + raise ValueError("Padding must be 'center' or 'same'.") + + audio = audio[:, pad:-pad] + return audio.unsqueeze(1) + + +class IMDCTSymExpHead(FourierHead): + """ + IMDCT Head module for predicting MDCT coefficients with symmetric exponential function + + Args: + dim (int): Hidden dimension of the model. + mdct_frame_len (int): Length of the MDCT frame. + padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". + sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized + based on perceptual scaling. Defaults to None. + clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False. + """ + + def __init__( + self, dim: int, gin_channels: int, mdct_frame_len: int, padding: str = "same", sample_rate: int = 24000, + ): + super().__init__() + out_dim = mdct_frame_len + self.dconv = DilatedCausalConv1d(dim, dim, 5, 1, dim, 1) + self.pconv1 = LoRALinear1d(dim, dim * 2, gin_channels, 2) + self.pconv2 = LoRALinear1d(dim * 2, out_dim, gin_channels, 2) + self.act = torch.nn.GELU() + self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding) + + if sample_rate is not None: + # optionally init the last layer following mel-scale + m_max = _hz_to_mel(sample_rate // 2) + m_pts = torch.linspace(0, m_max, out_dim) + f_pts = _mel_to_hz(m_pts) + scale = 1 - (f_pts / f_pts.max()) + + with torch.no_grad(): + self.pconv2.main_fc.weight.mul_(scale.view(-1, 1, 1)) + + def forward(self, x: torch.Tensor, g: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the IMDCTSymExpHead module. 
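+        The predicted MDCT coefficients are mapped through symexp (sign(x) * (exp(|x|) - 1)) and clipped before the inverse MDCT is applied.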
+ + Args: + x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, + L is the sequence length, and H denotes the model dimension. + + Returns: + Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. + """ + x = self.dconv(x) + x = self.pconv1(x, g) + x = self.act(x) + x = self.pconv2(x, g) + x = symexp(x) + x = torch.clip(x, min=-1e2, max=1e2) # safeguard to prevent excessively large magnitudes + audio = self.imdct(x) + return audio + + def remove_weight_norm(self): + self.dconv.remove_weight_norm() + self.pconv1.remove_weight_norm() + self.pconv2.remove_weight_norm() + + def fix_speaker(self, g): + self.pconv1.fix_speaker(g) + self.pconv2.fix_speaker(g) + + def unfix_speaker(self, g): + self.pconv1.unfix_speaker(g) + self.pconv2.unfix_speaker(g) + +def symexp(x: torch.Tensor) -> torch.Tensor: + return torch.sign(x) * (torch.exp(x.abs()) - 1) \ No newline at end of file diff --git a/server/voice_changer/RVC/inferencer/model_v3/transforms.py b/server/voice_changer/RVC/inferencer/voras_beta/transforms.py similarity index 100% rename from server/voice_changer/RVC/inferencer/model_v3/transforms.py rename to server/voice_changer/RVC/inferencer/voras_beta/transforms.py diff --git a/server/voice_changer/RVC/inferencer/model_v3/utils.py b/server/voice_changer/RVC/inferencer/voras_beta/utils.py similarity index 100% rename from server/voice_changer/RVC/inferencer/model_v3/utils.py rename to server/voice_changer/RVC/inferencer/voras_beta/utils.py diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py index 1d8fbd0c..4e24acb1 100644 --- a/server/voice_changer/RVC/pipeline/Pipeline.py +++ b/server/voice_changer/RVC/pipeline/Pipeline.py @@ -3,6 +3,7 @@ from typing import Any import math import torch import torch.nn.functional as F +from torch.cuda.amp import autocast from Exceptions import ( DeviceCannotSupportHalfPrecisionException, DeviceChangingException, @@ -118,10 +119,6 @@ class Pipeline(object): # tensor型調整 feats = audio_pad - if self.isHalf is True: - feats = feats.half() - else: - feats = feats.float() if feats.dim() == 2: # double channels feats = feats.mean(-1) assert feats.dim() == 1, feats.dim() @@ -129,19 +126,20 @@ class Pipeline(object): # embedding padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - try: - feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj) - if torch.isnan(feats).all(): - raise DeviceCannotSupportHalfPrecisionException() - except RuntimeError as e: - if "HALF" in e.__str__().upper(): - raise HalfPrecisionChangingException() - elif "same device" in e.__str__(): - raise DeviceChangingException() - else: - raise e - if protect < 0.5 and search_index: - feats0 = feats.clone() + with autocast(enabled=self.isHalf): + try: + feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj) + if torch.isnan(feats).all(): + raise DeviceCannotSupportHalfPrecisionException() + except RuntimeError as e: + if "HALF" in e.__str__().upper(): + raise HalfPrecisionChangingException() + elif "same device" in e.__str__(): + raise DeviceChangingException() + else: + raise e + if protect < 0.5 and search_index: + feats0 = feats.clone() # Index - feature抽出 # if self.index is not None and self.feature is not None and index_rate != 0: @@ -167,10 +165,8 @@ class Pipeline(object): # recover silient font npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy]) - if self.isHalf is True: - npy = 
npy.astype("float16") - feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) if protect < 0.5 and search_index: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) @@ -207,14 +203,15 @@ class Pipeline(object): # 推論実行 try: with torch.no_grad(): - audio1 = ( - torch.clip( - self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), - -1.0, - 1.0, - ) - * 32767.5 - ).data.to(dtype=torch.int16) + with autocast(enabled=self.isHalf): + audio1 = ( + torch.clip( + self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), + -1.0, + 1.0, + ) + * 32767.5 + ).data.to(dtype=torch.int16) except RuntimeError as e: if "HALF" in e.__str__().upper(): print("11", e)