WIP: support rvc-webui, refactoring

This commit is contained in:
wataru 2023-04-25 03:03:38 +09:00
parent 86798b3896
commit 777c2d6e1e
6 changed files with 62 additions and 32 deletions

View File

@ -40,6 +40,10 @@
{
"name": "onnxExecutor",
"options": {}
},
{
"name": "modelSamplingRate",
"options": {}
}
],
"modelSetting": [

File diff suppressed because one or more lines are too long

View File

@ -40,6 +40,10 @@
{
"name": "onnxExecutor",
"options": {}
},
{
"name": "modelSamplingRate",
"options": {}
}
],
"modelSetting": [

View File

@ -30,10 +30,13 @@ class ModelWrapper:
self.embChannels = metadata["embChannels"]
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
except:
self.samplingRate = -1
self.samplingRate = 48000
self.f0 = True
print(f"[Voice Changer] Onnx version is old. Please regenerate the onnx file. Falling back to defaults")
self.embChannels = 256
print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################")
print(f"[Voice Changer] This onnx's version is deprecated. Please regenerate the onnx file. Falling back to defaults")
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################")
def getSamplingRate(self):
return self.samplingRate

View File

@ -79,7 +79,7 @@ class RVCSettings():
rvcQuality: int = 0
silenceFront: int = 1 # 0:off, 1:on
modelSamplingRate: int = 48000
modelSlotIndex: int = 0
modelSlotIndex: int = -1
speakers: dict[str, int] = field(
default_factory=lambda: {}
@ -118,13 +118,30 @@ class RVC:
params_str = props["params"]
params = json.loads(params_str)
self.settings.modelSlots[self.tmp_slot] = ModelSlot(
pyTorchModelFile=props["files"]["pyTorchModelFilename"],
onnxModelFile=props["files"]["onnxModelFilename"],
featureFile=props["files"]["featureFilename"],
indexFile=props["files"]["indexFilename"],
defaultTrans=params["trans"]
)
# self.settings.modelSlots[self.tmp_slot] = ModelSlot(
# pyTorchModelFile=props["files"]["pyTorchModelFilename"],
# onnxModelFile=props["files"]["onnxModelFilename"],
# featureFile=props["files"]["featureFilename"],
# indexFile=props["files"]["indexFilename"],
# defaultTrans=params["trans"]
# )
newSlot = asdict(self.settings.modelSlots[self.tmp_slot])
newSlot.update({
"pyTorchModelFile": props["files"]["pyTorchModelFilename"],
"onnxModelFile": props["files"]["onnxModelFilename"],
"featureFile": props["files"]["featureFilename"],
"indexFile": props["files"]["indexFilename"],
"defaultTrans": params["trans"]
})
# .update({
# pyTorchModelFile: props["files"]["pyTorchModelFilename"],
# onnxModelFile: props["files"]["onnxModelFilename"],
# featureFile: props["files"]["featureFilename"],
# indexFile: props["files"]["indexFilename"],
# defaultTrans: params["trans"]
# })
self.settings.modelSlots[self.tmp_slot] = ModelSlot(**newSlot)
print("[Voice Changer] RVC loading... slot:", self.tmp_slot)
@ -213,8 +230,8 @@ class RVC:
self.next_onnx_session = ModelWrapper(onnxModelFile)
self.settings.modelSlots[slot].samplingRateOnnx = self.next_onnx_session.getSamplingRate()
self.settings.modelSlots[slot].f0Onnx = self.next_onnx_session.getF0()
if self.settings.modelSlots[slot].samplingRate == -1: # ONNXにsampling rateが入っていない
self.settings.modelSlots[slot].samplingRate = self.settings.modelSamplingRate
# if self.settings.modelSlots[slot].samplingRate == -1: # ONNXにsampling rateが入っていない
# self.settings.modelSlots[slot].samplingRate = self.settings.modelSamplingRate
self.settings.modelSlots[slot].embChannelsOnnx = self.next_onnx_session.getEmbChannels()
# ONNXがある場合は、ONNXの設定を優先
@ -228,6 +245,8 @@ class RVC:
self.next_feature_file = self.settings.modelSlots[slot].featureFile
self.next_index_file = self.settings.modelSlots[slot].indexFile
self.next_trans = self.settings.modelSlots[slot].defaultTrans
self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
self.next_framework = "ONNX" if self.next_onnx_session != None else "PyTorch"
print("[Voice Changer] Prepare done.",)
return self.get_info()
@ -240,6 +259,8 @@ class RVC:
self.feature_file = self.next_feature_file
self.index_file = self.next_index_file
self.settings.tran = self.next_trans
self.settings.framework = self.next_framework
self.settings.modelSamplingRate = self.next_samplingRate
self.next_net_g = None
self.next_onnx_session = None
print("[Voice Changer] Switching model..done",)
@ -351,14 +372,13 @@ class RVC:
file_index = self.index_file if self.index_file != None else ""
file_big_npy = self.feature_file if self.feature_file != None else ""
index_rate = self.settings.indexRatio
if_f0 = 1
if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
f0_file = None
f0 = self.settings.modelSlots[self.currentSlot].f0
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
print("embChannels::1:", embChannels)
audio_out = vc.pipeline(self.hubert_model, self.onnx_session, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels)
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels)
result = audio_out * np.sqrt(vol)
return result
@ -399,22 +419,21 @@ class RVC:
file_index = self.index_file if self.index_file != None else ""
file_big_npy = self.feature_file if self.feature_file != None else ""
index_rate = self.settings.indexRatio
if_f0 = 1
if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
f0_file = None
f0 = self.settings.modelSlots[self.currentSlot].f0
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels)
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels)
result = audio_out * np.sqrt(vol)
return result
def inference(self, data):
# if self.settings.modelSlotIndex < -1:
# print("[Voice Changer] No model uploaded.")
# raise NoModeLoadedException("model_common")
if self.settings.modelSlotIndex < -1:
print("[Voice Changer] No model uploaded.")
raise NoModeLoadedException("model_common")
if self.currentSlot != self.settings.modelSlotIndex:
print(f"Switch model {self.currentSlot} -> {self.settings.modelSlotIndex}")

View File

@ -84,7 +84,7 @@ class VC(object):
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak # 1-0
def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, f0=True, embChannels=256): # ,file_index,file_big_npy
def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, embChannels=256): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if (self.is_half == True):
feats = feats.half()
@ -137,7 +137,7 @@ class VC(object):
p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad():
if f0 == True:
if pitch != None:
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
else:
if hasattr(net_g, "infer_pitchless"):
@ -154,7 +154,7 @@ class VC(object):
times[2] += (t2 - t1)
return audio1
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, f0=True, embChannels=256):
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, embChannels=256):
if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
try:
index = faiss.read_index(file_index)
@ -185,10 +185,10 @@ class VC(object):
times[1] += (t2 - t1)
if self.t_pad_tgt == 0:
audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, f0, embChannels))
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, embChannels))
else:
audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, f0, embChannels)[self.t_pad_tgt:-self.t_pad_tgt])
pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, embChannels)[self.t_pad_tgt:-self.t_pad_tgt])
audio_opt = np.concatenate(audio_opt)
del pitch, pitchf, sid