diff --git a/server/voice_changer/MMVCv13/MMVCv13.py b/server/voice_changer/MMVCv13/MMVCv13.py
index e314b371..10f5144e 100644
--- a/server/voice_changer/MMVCv13/MMVCv13.py
+++ b/server/voice_changer/MMVCv13/MMVCv13.py
@@ -2,7 +2,6 @@ import sys
 import os

 from data.ModelSlot import MMVCv13ModelSlot
-from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
 from voice_changer.utils.VoiceChangerModel import AudioInOut

 if sys.platform.startswith("darwin"):
@@ -22,8 +21,10 @@
 import numpy as np
 import torch
 import onnxruntime
-from symbols import symbols  # type:ignore
-from models import SynthesizerTrn  # type:ignore
+# from symbols import symbols  # type:ignore
+# from models import SynthesizerTrn  # type:ignore
+from voice_changer.MMVCv13.models.models import SynthesizerTrn
+from voice_changer.MMVCv13.models.symbols import symbols
 from voice_changer.MMVCv13.TrainerFunctions import (
     TextAudioSpeakerCollate,
     spectrogram_torch,
@@ -40,21 +41,15 @@ class MMVCv13Settings:
     srcId: int = 0
     dstId: int = 101

-    framework: str = "PyTorch"  # PyTorch or ONNX
-    pyTorchModelFile: str = ""
-    onnxModelFile: str = ""
-    configFile: str = ""
-
     # ↓ list only the mutable fields
     intData = ["gpu", "srcId", "dstId"]
     floatData: list[str] = field(default_factory=lambda: [])
-    strData = ["framework"]
+    strData: list[str] = field(default_factory=lambda: [])


 class MMVCv13:
-    audio_buffer: AudioInOut | None = None
-
-    def __init__(self):
+    def __init__(self, slotInfo: MMVCv13ModelSlot):
+        print("[Voice Changer] [MMVCv13] Creating instance ")
         self.settings = MMVCv13Settings()
         self.net_g = None
         self.onnx_session = None
@@ -62,43 +57,35 @@ class MMVCv13:
         self.gpu_num = torch.cuda.device_count()
         self.text_norm = torch.LongTensor([0, 6, 0])

-    def loadModel(self, props: LoadModelParams):
-        params = props.params
+        self.audio_buffer: AudioInOut | None = None
+        self.slotInfo = slotInfo
+        self.initialize()

-        self.settings.configFile = params["files"]["mmvcv13Config"]
-        self.hps = get_hparams_from_file(self.settings.configFile)
+    def initialize(self):
+        print("[Voice Changer] [MMVCv13] Initializing... ")

-        modelFile = params["files"]["mmvcv13Model"]
-        if modelFile.endswith(".onnx"):
-            self.settings.pyTorchModelFile = None
-            self.settings.onnxModelFile = modelFile
-        else:
-            self.settings.pyTorchModelFile = modelFile
-            self.settings.onnxModelFile = None
-
-        # Create the PyTorch model
-        if self.settings.pyTorchModelFile is not None:
-            self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model)
-            self.net_g.eval()
-            load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None)
-
-        # Create the ONNX session
-        if self.settings.onnxModelFile is not None:
-            # ort_options = onnxruntime.SessionOptions()
-            # ort_options.intra_op_num_threads = 8
-            # ort_options.execution_mode = ort_options.ExecutionMode.ORT_PARALLEL
-            # ort_options.inter_op_num_threads = 8
+        self.hps = get_hparams_from_file(self.slotInfo.configFile)
+        if self.slotInfo.isONNX:
             providers, options = self.getOnnxExecutionProvider()
             self.onnx_session = onnxruntime.InferenceSession(
-                self.settings.onnxModelFile,
+                self.slotInfo.modelFile,
                 providers=providers,
                 provider_options=options,
             )
-        return self.get_info()
+        else:
+            self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model)
+            self.net_g.eval()
+            load_checkpoint(self.slotInfo.modelFile, self.net_g, None)
+
+        # Other settings
+        self.settings.srcId = self.slotInfo.srcId
+        self.settings.dstId = self.slotInfo.dstId
+        print("[Voice Changer] [MMVCv13] Initializing... done")

     def getOnnxExecutionProvider(self):
         availableProviders = onnxruntime.get_available_providers()
-        if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders:
+        devNum = torch.cuda.device_count()
+        if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0:
             return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
         elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
             return ["DmlExecutionProvider"], [{}]
@@ -111,21 +98,15 @@
             }
         ]

-    def isOnnx(self):
-        if self.settings.onnxModelFile is not None:
-            return True
-        else:
-            return False
-
     def update_settings(self, key: str, val: int | float | str):
         if key in self.settings.intData:
             val = int(val)
             setattr(self.settings, key, val)
-            if key == "gpu" and self.isOnnx():
+            if key == "gpu" and self.slotInfo.isONNX:
                 providers, options = self.getOnnxExecutionProvider()
                 self.onnx_session = onnxruntime.InferenceSession(
-                    self.settings.onnxModelFile,
+                    self.slotInfo.modelFile,
                     providers=providers,
                     provider_options=options,
                 )
@@ -150,13 +131,6 @@
         data = asdict(self.settings)
         data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []

-        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
-        for f in files:
-            if data[f] is not None and os.path.exists(data[f]):
-                data[f] = os.path.basename(data[f])
-            else:
-                data[f] = ""
-
         return data

     def get_processing_sampling_rate(self):
@@ -211,7 +185,7 @@
         return data

     def _onnx_inference(self, data):
-        if hasattr(self, "onnx_session") is False or self.onnx_session is None:
+        if self.onnx_session is None:
             print("[Voice Changer] No ONNX session.")
             raise NoModeLoadedException("ONNX")

@@ -254,24 +228,12 @@
         return result

     def inference(self, data):
-        if self.isOnnx():
+        if self.slotInfo.isONNX:
             audio = self._onnx_inference(data)
         else:
             audio = self._pyTorch_inference(data)
         return audio

-    @classmethod
-    def loadModel2(cls, props: LoadModelParams2):
-        slotInfo: MMVCv13ModelSlot = MMVCv13ModelSlot()
-        for file in props.files:
-            if file.kind == "mmvcv13Model":
-                slotInfo.modelFile = file.name
-            elif file.kind == "mmvcv13Config":
-                slotInfo.configFile = file.name
-        slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
-        slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
-        return slotInfo
-
     def __del__(self):
         del self.net_g
         del self.onnx_session
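Note: MMVCv13 no longer resolves files itself in loadModel(); it now receives a
prepared MMVCv13ModelSlot and builds either the ONNX session or the PyTorch
model eagerly in initialize(). A minimal sketch of the new two-phase lifecycle,
using the slot generator added below (variable names are illustrative):

    # Upload time: record file metadata once and persist it.
    slotInfo = MMVCv13ModelSlotGenerator.loadModel(params)
    # Slot selection time: rebuild the runtime from the stored slot.
    vc = MMVCv13(slotInfo)  # __init__ -> initialize() -> ONNX session or net_g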
") - modelFile = params["files"]["mmvcv13Model"] - if modelFile.endswith(".onnx"): - self.settings.pyTorchModelFile = None - self.settings.onnxModelFile = modelFile - else: - self.settings.pyTorchModelFile = modelFile - self.settings.onnxModelFile = None - - # PyTorchモデル生成 - if self.settings.pyTorchModelFile is not None: - self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model) - self.net_g.eval() - load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None) - - # ONNXモデル生成 - if self.settings.onnxModelFile is not None: - # ort_options = onnxruntime.SessionOptions() - # ort_options.intra_op_num_threads = 8 - # ort_options.execution_mode = ort_options.ExecutionMode.ORT_PARALLEL - # ort_options.inter_op_num_threads = 8 + self.hps = get_hparams_from_file(self.slotInfo.configFile) + if self.slotInfo.isONNX: providers, options = self.getOnnxExecutionProvider() self.onnx_session = onnxruntime.InferenceSession( - self.settings.onnxModelFile, + self.slotInfo.modelFile, providers=providers, provider_options=options, ) - return self.get_info() + else: + self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model) + self.net_g.eval() + load_checkpoint(self.slotInfo.modelFile, self.net_g, None) + + # その他の設定 + self.settings.srcId = self.slotInfo.srcId + self.settings.dstId = self.slotInfo.dstId + print("[Voice Changer] [MMVCv13] Initializing... done") def getOnnxExecutionProvider(self): availableProviders = onnxruntime.get_available_providers() - if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders: + devNum = torch.cuda.device_count() + if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0: return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}] elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders: return ["DmlExecutionProvider"], [{}] @@ -111,21 +98,15 @@ class MMVCv13: } ] - def isOnnx(self): - if self.settings.onnxModelFile is not None: - return True - else: - return False - def update_settings(self, key: str, val: int | float | str): if key in self.settings.intData: val = int(val) setattr(self.settings, key, val) - if key == "gpu" and self.isOnnx(): + if key == "gpu" and self.slotInfo.isONNX: providers, options = self.getOnnxExecutionProvider() self.onnx_session = onnxruntime.InferenceSession( - self.settings.onnxModelFile, + self.slotInfo.modelFile, providers=providers, provider_options=options, ) @@ -150,13 +131,6 @@ class MMVCv13: data = asdict(self.settings) data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else [] - files = ["configFile", "pyTorchModelFile", "onnxModelFile"] - for f in files: - if data[f] is not None and os.path.exists(data[f]): - data[f] = os.path.basename(data[f]) - else: - data[f] = "" - return data def get_processing_sampling_rate(self): @@ -211,7 +185,7 @@ class MMVCv13: return data def _onnx_inference(self, data): - if hasattr(self, "onnx_session") is False or self.onnx_session is None: + if self.onnx_session is None: print("[Voice Changer] No ONNX session.") raise NoModeLoadedException("ONNX") @@ -254,24 +228,12 @@ class MMVCv13: return result def inference(self, data): - if self.isOnnx(): + if self.slotInfo.isONNX: audio = self._onnx_inference(data) else: 
diff --git a/server/voice_changer/MMVCv13/models/commons.py b/server/voice_changer/MMVCv13/models/commons.py
new file mode 100644
index 00000000..b0928f92
--- /dev/null
+++ b/server/voice_changer/MMVCv13/models/commons.py
@@ -0,0 +1,27 @@
+import torch
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
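fused_add_tanh_sigmoid_multiply is the WaveNet-style gated activation used by
WN below: the first n_channels channels of (input_a + input_b) pass through
tanh (the filter), the remaining channels through sigmoid (the gate), and the
two halves are multiplied. A quick shape check (values illustrative):

    import torch
    a = torch.randn(1, 2 * 192, 100)   # conv output: filter and gate stacked
    b = torch.zeros_like(a)            # conditioning term (zeros when g is None)
    out = fused_add_tanh_sigmoid_multiply(a, b, torch.IntTensor([192]))
    print(out.shape)                   # torch.Size([1, 192, 100])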
diff --git a/server/voice_changer/MMVCv13/models/models.py b/server/voice_changer/MMVCv13/models/models.py
new file mode 100644
index 00000000..95db7044
--- /dev/null
+++ b/server/voice_changer/MMVCv13/models/models.py
@@ -0,0 +1,166 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .modules import ResidualCouplingLayer, Flip, WN, ResBlock1, ResBlock2, LRELU_SLOPE
+
+
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+from .commons import init_weights, sequence_mask
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+
+
+class PosteriorEncoder(nn.Module):
+    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+        # self.randn = torch.randn(1, 1, 1)  # initialized as a dummy
+
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        # if self.randn.size() != m.size():  # regenerate only when the shape of m changes
+        self.randn = torch.randn_like(m)
+        z = (m + self.randn * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        resblock = ResBlock1 if resblock == "1" else ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2)))
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(resblock(ch, k, d))
+
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+
+        if gin_channels != 0:
+            # self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+            gin_channels = 0
+
+    def forward(self, x, g=None):
+        x = self.conv_pre(x)
+        if g is not None:
+            # x = x + self.cond(g)
+            g = None
+
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        print("Removing weight norm...")
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+
+
+class SynthesizerTrn(nn.Module):
+    """
+    Synthesizer for Training
+    """
+
+    def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, n_flow, n_speakers=0, gin_channels=0, use_sdp=True, **kwargs):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.spec_channels = spec_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.n_speakers = n_speakers
+        self.gin_channels = gin_channels
+        self.use_sdp = use_sdp
+
+        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels)
+
+        if n_speakers > 1:
+            self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+    def forward(self, y, y_lengths, sid_src, sid_tgt):
+        return self.voice_conversion(y, y_lengths, sid_src, sid_tgt)
+
+    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
+        assert self.n_speakers > 0, "n_speakers has to be larger than 0."
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        z_p = self.flow(z, y_mask, g=g_src)
+        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+        return o_hat
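voice_conversion() is the usual VITS any-to-any path: the posterior encoder
reads the source spectrogram under the source speaker embedding, the
normalizing flow maps z to a roughly speaker-independent z_p, the inverse flow
re-applies the target speaker, and the HiFi-GAN-style Generator renders audio.
Schematically:

    # z   = enc_q(spec; g_src)     speaker-conditioned posterior sample
    # z_p = flow(z; g_src)         strip source speaker identity
    # z^  = flow^-1(z_p; g_tgt)    inject target speaker identity
    # wav = dec(z^; g_tgt)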
diff --git a/server/voice_changer/MMVCv13/models/modules.py b/server/voice_changer/MMVCv13/models/modules.py
new file mode 100644
index 00000000..14889c8e
--- /dev/null
+++ b/server/voice_changer/MMVCv13/models/modules.py
@@ -0,0 +1,186 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
+
+
+LRELU_SLOPE = 0.1
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        self.kernel_size = (kernel_size,)
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+
+        for i in range(n_layers):
+            dilation = dilation_rate**i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, : self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList([weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2])))])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList([weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1])))])
+        self.convs.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
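ResidualCouplingLayer is an affine coupling step: x0 passes through unchanged
and parameterizes (m, logs) for x1, which makes the transform invertible in
closed form:

    # forward:  x1' = m + x1 * exp(logs)     (logdet = sum(logs))
    # reverse:  x1  = (x1' - m) * exp(-logs)

With mean_only=True, as ResidualCouplingBlock uses it, logs is zero and each
step reduces to a shift; the interleaved Flip layers alternate which half of
the channels gets updated.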
diff --git a/server/voice_changer/MMVCv13/models/readme.txt b/server/voice_changer/MMVCv13/models/readme.txt
new file mode 100644
index 00000000..445357b5
--- /dev/null
+++ b/server/voice_changer/MMVCv13/models/readme.txt
@@ -0,0 +1 @@
+The modules in this folder are from https://github.com/isletennos/MMVC_Client.git at 04f3fec4fd82dea6657026ec4e1cd80fb29a415c
\ No newline at end of file
diff --git a/server/voice_changer/MMVCv13/models/symbols.py b/server/voice_changer/MMVCv13/models/symbols.py
new file mode 100644
index 00000000..ce6ca0c9
--- /dev/null
+++ b/server/voice_changer/MMVCv13/models/symbols.py
@@ -0,0 +1,64 @@
+""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """
+""" from https://github.com/keithito/tacotron """
+
+"""
+Defines the set of symbols used in text input to the model.
+"""
+_pad = "_"
+_punctuation = ';:,.!?¡¿—…"«»“” '
+_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+
+
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+# Special symbol ids
+SPACE_ID = symbols.index(" ")
+
+symbols = [
+    "A",
+    "E",
+    "I",
+    "N",
+    "O",
+    "U",
+    "a",
+    "b",
+    "by",
+    "ch",
+    "cl",
+    "d",
+    "dy",
+    "e",
+    "f",
+    "g",
+    "gy",
+    "h",
+    "hy",
+    "i",
+    "j",
+    "k",
+    "ky",
+    "m",
+    "my",
+    "n",
+    "ny",
+    "o",
+    "p",
+    "py",
+    "r",
+    "ry",
+    "s",
+    "sh",
+    "t",
+    "ts",
+    "ty",
+    "u",
+    "v",
+    "w",
+    "y",
+    "z",
+    "pau",
+    "sil",
+]
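Note that the second assignment to symbols replaces the VITS/Tacotron character
set with MMVC's Japanese phoneme list (vendored as-is from MMVC_Client), so
SPACE_ID above refers to the discarded set. Within this diff only the size of
the list is consumed:

    net_g = SynthesizerTrn(len(symbols), ...)  # as in MMVCv13.initialize()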
+""" +_pad = "_" +_punctuation = ';:,.!?¡¿—…"«»“” ' +_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" + + +# Export all symbols: +symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + +# Special symbol ids +SPACE_ID = symbols.index(" ") + +symbols = [ + "A", + "E", + "I", + "N", + "O", + "U", + "a", + "b", + "by", + "ch", + "cl", + "d", + "dy", + "e", + "f", + "g", + "gy", + "h", + "hy", + "i", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "p", + "py", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "ty", + "u", + "v", + "w", + "y", + "z", + "pau", + "sil", +] diff --git a/server/voice_changer/MMVCv15/MMVCv15.py b/server/voice_changer/MMVCv15/MMVCv15.py index 4edc1cad..73275625 100644 --- a/server/voice_changer/MMVCv15/MMVCv15.py +++ b/server/voice_changer/MMVCv15/MMVCv15.py @@ -1,8 +1,6 @@ import sys import os from data.ModelSlot import MMVCv15ModelSlot - -from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2 from voice_changer.utils.VoiceChangerModel import AudioInOut if sys.platform.startswith("darwin"): @@ -49,42 +47,29 @@ class MMVCv15Settings: f0Factor: float = 1.0 f0Detector: str = "dio" # dio or harvest - framework: str = "PyTorch" # PyTorch or ONNX - pyTorchModelFile: str = "" - onnxModelFile: str = "" - configFile: str = "" - # ↓mutableな物だけ列挙 intData = ["gpu", "srcId", "dstId"] floatData = ["f0Factor"] - strData = ["framework", "f0Detector"] + strData = ["f0Detector"] class MMVCv15: - audio_buffer: AudioInOut | None = None - - def __init__(self): + def __init__(self, slotInfo: MMVCv15ModelSlot): + print("[Voice Changer] [MMVCv15] Creating instance ") self.settings = MMVCv15Settings() self.net_g = None - self.onnx_session = None + self.onnx_session: onnxruntime.InferenceSession | None = None self.gpu_num = torch.cuda.device_count() - def loadModel(self, props: LoadModelParams): - params = props.params + self.slotInfo = slotInfo + self.audio_buffer: AudioInOut | None = None + self.initialize() - self.settings.configFile = params["files"]["mmvcv15Config"] - self.hps = get_hparams_from_file(self.settings.configFile) + def initialize(self): + print("[Voice Changer] [MMVCv15] Initializing... 
") + self.hps = get_hparams_from_file(self.slotInfo.configFile) - modelFile = params["files"]["mmvcv15Model"] - if modelFile.endswith(".onnx"): - self.settings.pyTorchModelFile = None - self.settings.onnxModelFile = modelFile - else: - self.settings.pyTorchModelFile = modelFile - self.settings.onnxModelFile = None - - # PyTorchモデル生成 self.net_g = SynthesizerTrn( spec_channels=self.hps.data.filter_length // 2 + 1, segment_size=self.hps.train.segment_size // self.hps.data.hop_length, @@ -103,18 +88,12 @@ class MMVCv15: requires_grad_text_enc=self.hps.requires_grad.text_enc, requires_grad_dec=self.hps.requires_grad.dec, ) - if self.settings.pyTorchModelFile is not None: - self.settings.framework = "PyTorch" - self.net_g.eval() - load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None) - # ONNXモデル生成 - self.onxx_input_length = 8192 - if self.settings.onnxModelFile is not None: - self.settings.framework = "ONNX" + if self.slotInfo.isONNX: + self.onxx_input_length = 8192 providers, options = self.getOnnxExecutionProvider() self.onnx_session = onnxruntime.InferenceSession( - self.settings.onnxModelFile, + self.slotInfo.modelFile, providers=providers, provider_options=options, ) @@ -123,11 +102,21 @@ class MMVCv15: # print("ONNX INPUT SHAPE", i.name, i.shape) if i.name == "sin": self.onxx_input_length = i.shape[2] - return self.get_info() + else: + self.net_g.eval() + load_checkpoint(self.slotInfo.modelFile, self.net_g, None) + + # その他の設定 + self.settings.srcId = self.slotInfo.srcId + self.settings.dstId = self.slotInfo.dstId + self.settings.f0Factor = self.slotInfo.f0Factor + + print("[Voice Changer] [MMVCv15] Initializing... done") def getOnnxExecutionProvider(self): availableProviders = onnxruntime.get_available_providers() - if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders: + devNum = torch.cuda.device_count() + if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0: return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}] elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders: return ["DmlExecutionProvider"], [{}] @@ -140,20 +129,14 @@ class MMVCv15: } ] - def isOnnx(self): - if self.settings.onnxModelFile is not None: - return True - else: - return False - def update_settings(self, key: str, val: int | float | str): if key in self.settings.intData: val = int(val) setattr(self.settings, key, val) - if key == "gpu" and self.isOnnx(): + if key == "gpu" and self.slotInfo.isONNX: providers, options = self.getOnnxExecutionProvider() self.onnx_session = onnxruntime.InferenceSession( - self.settings.onnxModelFile, + self.slotInfo.modelFile, providers=providers, provider_options=options, ) @@ -174,12 +157,6 @@ class MMVCv15: data = asdict(self.settings) data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else [] - files = ["configFile", "pyTorchModelFile", "onnxModelFile"] - for f in files: - if data[f] is not None and os.path.exists(data[f]): - data[f] = os.path.basename(data[f]) - else: - data[f] = "" return data @@ -241,7 +218,7 @@ class MMVCv15: convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)) # ONNX は固定長 - if self.settings.framework == "ONNX": + if self.slotInfo.isONNX: convertSize = self.onxx_input_length convertOffset = -1 * convertSize @@ -286,10 +263,6 @@ class MMVCv15: return audio1 def _pyTorch_inference(self, data): - if 
self.settings.pyTorchModelFile == "" or self.settings.pyTorchModelFile is None: - print("[Voice Changer] No pyTorch session.") - raise NoModeLoadedException("pytorch") - if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") else: @@ -309,7 +282,7 @@ class MMVCv15: def inference(self, data): try: - if self.isOnnx(): + if self.slotInfo.isONNX: audio = self._onnx_inference(data) else: audio = self._pyTorch_inference(data) @@ -318,18 +291,6 @@ class MMVCv15: print(_e) raise ONNXInputArgumentException() - @classmethod - def loadModel2(cls, props: LoadModelParams2): - slotInfo: MMVCv15ModelSlot = MMVCv15ModelSlot() - for file in props.files: - if file.kind == "mmvcv15Model": - slotInfo.modelFile = file.name - elif file.kind == "mmvcv15Config": - slotInfo.configFile = file.name - slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx") - slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0] - return slotInfo - def __del__(self): del self.net_g del self.onnx_session diff --git a/server/voice_changer/MMVCv15/MMVCv15ModelSlotGenerator.py b/server/voice_changer/MMVCv15/MMVCv15ModelSlotGenerator.py new file mode 100644 index 00000000..101e76ca --- /dev/null +++ b/server/voice_changer/MMVCv15/MMVCv15ModelSlotGenerator.py @@ -0,0 +1,19 @@ +import os + +from data.ModelSlot import MMVCv15ModelSlot +from voice_changer.utils.LoadModelParams import LoadModelParams +from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator + + +class MMVCv15ModelSlotGenerator(ModelSlotGenerator): + @classmethod + def loadModel(cls, props: LoadModelParams): + slotInfo: MMVCv15ModelSlot = MMVCv15ModelSlot() + for file in props.files: + if file.kind == "mmvcv15Model": + slotInfo.modelFile = file.name + elif file.kind == "mmvcv15Config": + slotInfo.configFile = file.name + slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx") + slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0] + return slotInfo diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index 0bd75c64..04fd37c6 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -35,26 +35,17 @@ from Exceptions import DeviceCannotSupportHalfPrecisionException class RVC(VoiceChangerModel): - initialLoad: bool = True - settings: RVCSettings = RVCSettings() - - pipeline: Pipeline | None = None - - deviceManager = DeviceManager.get_instance() - - audio_buffer: AudioInOut | None = None - prevVol: float = 0 - params: VoiceChangerParams - currentSlot: int = 0 - needSwitch: bool = False - def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot): print("[Voice Changer] [RVC] Creating instance ") + self.deviceManager = DeviceManager.get_instance() EmbedderManager.initialize(params) - + self.settings = RVCSettings() self.params = params self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector) + self.pipeline: Pipeline | None = None + + self.audio_buffer: AudioInOut | None = None self.prevVol = 0.0 self.slotInfo = slotInfo self.initialize() diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index 894c7d3b..f4d64206 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -116,14 +116,14 @@ class VoiceChangerManager(ServerDeviceCallbacks): slotInfo = RVCModelSlotGenerator.loadModel(params) self.modelSlotManager.save_model_slot(params.slot, slotInfo) elif params.voiceChangerType == "MMVCv13": - from 
diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py
index 894c7d3b..f4d64206 100644
--- a/server/voice_changer/VoiceChangerManager.py
+++ b/server/voice_changer/VoiceChangerManager.py
@@ -116,14 +116,14 @@ class VoiceChangerManager(ServerDeviceCallbacks):
             slotInfo = RVCModelSlotGenerator.loadModel(params)
             self.modelSlotManager.save_model_slot(params.slot, slotInfo)
         elif params.voiceChangerType == "MMVCv13":
-            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
+            from voice_changer.MMVCv13.MMVCv13ModelSlotGenerator import MMVCv13ModelSlotGenerator

-            slotInfo = MMVCv13.loadModel(params)
+            slotInfo = MMVCv13ModelSlotGenerator.loadModel(params)
             self.modelSlotManager.save_model_slot(params.slot, slotInfo)
         elif params.voiceChangerType == "MMVCv15":
-            from voice_changer.MMVCv15.MMVCv15 import MMVCv15
+            from voice_changer.MMVCv15.MMVCv15ModelSlotGenerator import MMVCv15ModelSlotGenerator

-            slotInfo = MMVCv15.loadModel(params)
+            slotInfo = MMVCv15ModelSlotGenerator.loadModel(params)
             self.modelSlotManager.save_model_slot(params.slot, slotInfo)
         elif params.voiceChangerType == "so-vits-svc-40":
             from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40
@@ -174,7 +174,20 @@
             self.voiceChangerModel = RVC(self.params, slotInfo)
             self.voiceChanger = VoiceChanger(self.params)
             self.voiceChanger.setModel(self.voiceChangerModel)
+        elif slotInfo.voiceChangerType == "MMVCv13":
+            print("................MMVCv13")
+            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
+
+            self.voiceChangerModel = MMVCv13(slotInfo)
+            self.voiceChanger = VoiceChanger(self.params)
+            self.voiceChanger.setModel(self.voiceChangerModel)
+        elif slotInfo.voiceChangerType == "MMVCv15":
+            print("................MMVCv15")
+            from voice_changer.MMVCv15.MMVCv15 import MMVCv15
+
+            self.voiceChangerModel = MMVCv15(slotInfo)
+            self.voiceChanger = VoiceChanger(self.params)
+            self.voiceChanger.setModel(self.voiceChangerModel)
         else:
             print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
             del self.voiceChangerModel