WIP: support rvc-webui

2023-04-23 19:36:41 +09:00 · 2023-04-23 19:36:41 +09:00 · d3823183d6
commit d3823183d6
parent 2fa33aad8d
3 changed files with 40 additions and 22 deletions
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@ -32,7 +32,7 @@ import pyworld as pw
 from voice_changer.RVC.custom_vc_infer_pipeline import VC
 from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
 from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMs768NSFsid
-from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768, RVC_MODEL_TYPE_UNKNOWN
+from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_UNKNOWN
 from fairseq import checkpoint_utils
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]

@ -155,34 +155,49 @@ class RVC:
        if pyTorchModelFile != None and pyTorchModelFile != "":
            cpt = torch.load(pyTorchModelFile, map_location="cpu")
            '''
-            ※ ノーマル or Pitchレス判定 ⇒ コンフィグのpsamplingrateの形状から判断
-            ■ ノーマル
-            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
-            ■ピッチレス
-            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 　512, [16, 16, 4, 4],109, 256, 40000]
-
-            12番目の要素upsamplingrateの数で判定。4: ピッチレス, 5:ノーマル
-
-            
-            ※ 256 or 768判定 ⇒ config全体の形状
+            (1) オリジナルとrvc-webuiのモデル判定 ⇒ config全体の形状
            ■ ノーマル256
            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
            ■ ノーマル 768対応
            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
-            config全体の長さで判定 ⇒ config全体の形状
+            ⇒ 18: オリジナル, 19: rvc-webui
+
+            (2-1) オリジナルのノーマルorPitchレス判定 ⇒ コンフィグのpsamplingrateの形状から判断
+            ■ ノーマル
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
+            ■ピッチレス
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 　512, [16, 16, 4, 4],109, 256, 40000]
+            12番目の要素upsamplingrateの数で判定。4: ピッチレス, 5:ノーマル
+
+            (2-2) rvc-webuiの、(256 or 768) x (ノーマルor pitchレス)判定 ⇒ 256, or 768 は17番目の要素で判定。
+            ■ 256 x ノーマル
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 256, 48000]
+            ■ 256 x pitchレス
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 256, 48000]
+            ■ 768 x ノーマル
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
+            ■ 768 x pitchレス
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
+            
            '''
+            print("config shape:::::", cpt["config"])

            config_len = len(cpt["config"])
            upsamplingRateDims = len(cpt["config"][12])
            if config_len == 18 and upsamplingRateDims == 4:
-                print("[Voice Changer] RVC Model Type: Pitch-Less")
+                print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_PITCH_LESS")
                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_PITCH_LESS
            elif config_len == 18 and upsamplingRateDims == 5:
-                print("[Voice Changer] RVC Model Type: Normal")
+                print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_NORMAL")
                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL
            elif config_len == 19:
-                print("[Voice Changer] RVC Model Type: Normal_768")
-                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL_768
+                embedding = cpt["config"][17]
+                if embedding == 256:
+                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_NORMAL")
+                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_NORMAL
+                else:
+                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_NORMAL")
+                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_NORMAL
            else:
                print("[Voice Changer] RVC Model Type: UNKNOWN")
                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_UNKNOWN
@ -193,7 +208,7 @@ class RVC:
                net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_PITCH_LESS:
                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_NORMAL_768:
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
                net_g = SynthesizerTrnMs768NSFsid(**cpt["params"], is_half=self.is_half)
            else:
                print("unknwon")
--- a/server/voice_changer/RVC/const.py
+++ b/server/voice_changer/RVC/const.py
@ -1,4 +1,7 @@
 RVC_MODEL_TYPE_NORMAL = 0
 RVC_MODEL_TYPE_PITCH_LESS = 1
-RVC_MODEL_TYPE_NORMAL_768 = 2
+RVC_MODEL_TYPE_WEBUI_256_NORMAL = 2
+RVC_MODEL_TYPE_WEBUI_256_PITCHLESS = 3
+RVC_MODEL_TYPE_WEBUI_768_NORMAL = 4
+RVC_MODEL_TYPE_WEBUI_768_PITCHLESS = 5
 RVC_MODEL_TYPE_UNKNOWN = 99
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@ -10,7 +10,7 @@ import pyworld
 import os
 import traceback
 import faiss
-from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768
+from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL


 class VC(object):
@ -94,7 +94,7 @@ class VC(object):
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
+        if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL:
            inputs = {
                "source": feats.to(self.device),
                "padding_mask": padding_mask,
@ -109,7 +109,7 @@ class VC(object):
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
-            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
+            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL:
                feats = model.final_proj(logits[0])
            else:
                feats = logits[0]
@ -136,7 +136,7 @@ class VC(object):
        p_len = torch.tensor([p_len], device=self.device).long()

        with torch.no_grad():
-            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_NORMAL_768:
+            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
                audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
            else:
                audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)