diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index dadcb033..4f4da9ba 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -32,7 +32,7 @@ import pyworld as pw
 from voice_changer.RVC.custom_vc_infer_pipeline import VC
 from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
 from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMs768NSFsid
-from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768, RVC_MODEL_TYPE_UNKNOWN
+from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_UNKNOWN
 from fairseq import checkpoint_utils

 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
@@ -155,34 +155,49 @@ class RVC:
         if pyTorchModelFile != None and pyTorchModelFile != "":
             cpt = torch.load(pyTorchModelFile, map_location="cpu")
             '''
-            ※ Normal vs. pitch-less detection ⇒ determined from the shape of the config's upsampling rates
-            ■ Normal
-            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
-            ■ Pitch-less
-            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
-
-            Determined by the number of upsampling rates in element 12. 4: pitch-less, 5: normal
-
-
-            ※ 256 vs. 768 detection ⇒ overall shape of the config
+            (1) Original vs. rvc-webui model detection ⇒ overall shape of the config
             ■ Normal 256
             [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
             ■ Normal with 768 support
             [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
-            Determined by the overall length of the config ⇒ overall shape of the config
+            ⇒ 18: original, 19: rvc-webui
+
+            (2-1) Original: normal vs. pitch-less detection ⇒ determined from the shape of the config's upsampling rates
+            ■ Normal
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
+            ■ Pitch-less
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
+            Determined by the number of upsampling rates in element 12. 4: pitch-less, 5: normal
+
+            (2-2) rvc-webui: (256 or 768) x (normal or pitch-less) detection ⇒ 256 vs. 768 is determined by element 17.
+            ■ 256 x normal
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 256, 48000]
+            ■ 256 x pitch-less
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 256, 48000]
+            ■ 768 x normal
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
+            ■ 768 x pitch-less
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
+
             '''
+            print("config shape:::::", cpt["config"])
             config_len = len(cpt["config"])
             upsamplingRateDims = len(cpt["config"][12])
             if config_len == 18 and upsamplingRateDims == 4:
-                print("[Voice Changer] RVC Model Type: Pitch-Less")
+                print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_PITCH_LESS")
                 self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_PITCH_LESS
             elif config_len == 18 and upsamplingRateDims == 5:
-                print("[Voice Changer] RVC Model Type: Normal")
+                print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_NORMAL")
                 self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL
             elif config_len == 19:
-                print("[Voice Changer] RVC Model Type: Normal_768")
-                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL_768
+                embedding = cpt["config"][17]
+                if embedding == 256:
+                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_NORMAL")
+                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_NORMAL
+                else:
+                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_NORMAL")
+                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_NORMAL
             else:
                 print("[Voice Changer] RVC Model Type: UNKNOWN")
                 self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_UNKNOWN
@@ -193,7 +208,7 @@ class RVC:
                 net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
             elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_PITCH_LESS:
                 net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_NORMAL_768:
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
                 net_g = SynthesizerTrnMs768NSFsid(**cpt["params"], is_half=self.is_half)
             else:
                 print("unknown")
diff --git a/server/voice_changer/RVC/const.py b/server/voice_changer/RVC/const.py
index aacb42df..0abd8bc3 100644
--- a/server/voice_changer/RVC/const.py
+++ b/server/voice_changer/RVC/const.py
@@ -1,4 +1,7 @@
 RVC_MODEL_TYPE_NORMAL = 0
 RVC_MODEL_TYPE_PITCH_LESS = 1
-RVC_MODEL_TYPE_NORMAL_768 = 2
+RVC_MODEL_TYPE_WEBUI_256_NORMAL = 2
+RVC_MODEL_TYPE_WEBUI_256_PITCHLESS = 3
+RVC_MODEL_TYPE_WEBUI_768_NORMAL = 4
+RVC_MODEL_TYPE_WEBUI_768_PITCHLESS = 5
 RVC_MODEL_TYPE_UNKNOWN = 99
diff --git a/server/voice_changer/RVC/custom_vc_infer_pipeline.py b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
index 29419c17..5e15f7ab 100644
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@@ -10,7 +10,7 @@ import pyworld
 import os
 import traceback
 import faiss
-from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768
+from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL


 class VC(object):
@@ -94,7 +94,7 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
+        if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL:
             inputs = {
                 "source": feats.to(self.device),
                 "padding_mask": padding_mask,
@@ -109,7 +109,7 @@ class VC(object):
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
+            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL:
                 feats = model.final_proj(logits[0])
             else:
                 feats = logits[0]
@@ -136,7 +136,7 @@ class VC(object):
         p_len = torch.tensor([p_len], device=self.device).long()

         with torch.no_grad():
-            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_NORMAL_768:
+            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
                 audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
             else:
                 audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
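
Note: for reference, a minimal standalone sketch of the detection heuristic documented in the RVC.py docstring above. The constants mirror server/voice_changer/RVC/const.py; the function name detect_model_type and its signature are hypothetical, not part of this diff.

```python
from typing import Any, Dict

RVC_MODEL_TYPE_NORMAL = 0
RVC_MODEL_TYPE_PITCH_LESS = 1
RVC_MODEL_TYPE_WEBUI_256_NORMAL = 2
RVC_MODEL_TYPE_WEBUI_768_NORMAL = 4
RVC_MODEL_TYPE_UNKNOWN = 99


def detect_model_type(cpt: Dict[str, Any]) -> int:
    """Classify an RVC checkpoint from the shape of its config list."""
    config = cpt["config"]
    config_len = len(config)           # 18: original RVC, 19: rvc-webui
    upsampling_dims = len(config[12])  # 4: pitch-less, 5: normal

    if config_len == 18 and upsampling_dims == 4:
        return RVC_MODEL_TYPE_PITCH_LESS
    if config_len == 18 and upsampling_dims == 5:
        return RVC_MODEL_TYPE_NORMAL
    if config_len == 19:
        # rvc-webui checkpoints carry the embedding width at index 17.
        if config[17] == 256:
            return RVC_MODEL_TYPE_WEBUI_256_NORMAL
        return RVC_MODEL_TYPE_WEBUI_768_NORMAL
    return RVC_MODEL_TYPE_UNKNOWN
```

Usage follows the diff: load the checkpoint with `cpt = torch.load(path, map_location="cpu")`, then call `detect_model_type(cpt)`.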
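Note: the custom_vc_infer_pipeline.py changes hinge on the embedding width. 256-dim models (original normal/pitch-less and webui-256) project the HuBERT output through final_proj, while webui-768 models consume logits[0] unprojected. Below is a hedged sketch of that branch; `model` is the fairseq HuBERT instance already used by the pipeline, and the output_layer values (9 for the 256-dim path, 12 for the 768-dim path) are an assumption about the parts of the inputs dict this diff does not show.

```python
import torch

def extract_embedding(model, feats, padding_mask, model_type):
    # 256-dim paths: original normal/pitch-less and webui-256.
    is_256 = model_type in (
        RVC_MODEL_TYPE_NORMAL,
        RVC_MODEL_TYPE_PITCH_LESS,
        RVC_MODEL_TYPE_WEBUI_256_NORMAL,
    )
    inputs = {
        "source": feats,
        "padding_mask": padding_mask,
        "output_layer": 9 if is_256 else 12,  # assumed; not shown in the diff
    }
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        # 256-dim models project 768 -> 256 via final_proj;
        # 768-dim webui models keep the raw features.
        return model.final_proj(logits[0]) if is_256 else logits[0]
```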