WIP: support rvc-webui; pitch-less is not supported yet

2023-04-24 05:54:36 +09:00 · 2023-04-24 05:54:36 +09:00 · 11a536b03f
commit 11a536b03f
parent acfb7b601a
6 changed files with 245 additions and 182 deletions
--- a/server/voice_changer/RVC/ModelWrapper.py
+++ b/server/voice_changer/RVC/ModelWrapper.py
@ -1,6 +1,7 @@
 import onnxruntime
 import torch
 import numpy as np
+import json
 # providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
 providers = ["CPUExecutionProvider"]

@ -21,6 +22,23 @@ class ModelWrapper:
            self.is_half = False
        else:
            self.is_half = True
+        modelmeta = self.onnx_session.get_modelmeta()
+        try:
+            metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
+            self.samplingRate = metadata["samplingRate"]
+            self.f0 = metadata["f0"]
+            print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
+        except:
+            self.samplingRate = -1
+            self.f0 = True
+            print(f"[Voice Changer] Onnx version is old. Please regenerate onnxfile. Fallback to default")
+            print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
+
+    def getSamplingRate(self):
+        """Return the sampling rate read from the ONNX metadata, or -1 if the metadata could not be read."""
+        return self.samplingRate
+
+    def getF0(self):
+        """Return the f0 flag read from the ONNX metadata; it is True if the metadata could not be read."""
+        return self.f0

    def set_providers(self, providers, provider_options=[{}]):
+        """Forward the given providers and provider options to the wrapped ONNX session."""
        self.onnx_session.set_providers(providers=providers, provider_options=provider_options)
@ -28,14 +46,27 @@ class ModelWrapper:
    def get_providers(self):
+        """Return the providers reported by the wrapped ONNX session."""
        return self.onnx_session.get_providers()

+    def infer_pitchless(self, feats, p_len, sid):
+        """Run the ONNX session without pitch inputs and return its "audio" output as a tensor.
+
+        ``feats`` is cast to float16 when ``self.is_half`` is set and to float32
+        otherwise; ``p_len`` and ``sid`` are always cast to int64.
+        """
+        feats_dtype = np.float16 if self.is_half else np.float32
+        onnx_inputs = {
+            "feats": feats.cpu().numpy().astype(feats_dtype),
+            "p_len": p_len.cpu().numpy().astype(np.int64),
+            "sid": sid.cpu().numpy().astype(np.int64),
+        }
+        outputs = self.onnx_session.run(["audio"], onnx_inputs)
+        return torch.tensor(np.array(outputs))
+
    def infer(self, feats, p_len, pitch, pitchf, sid):
        if self.is_half:
-            # print("feats", feats.cpu().numpy().dtype)
-            # print("p_len", p_len.cpu().numpy().dtype)
-            # print("pitch", pitch.cpu().numpy().dtype)
-            # print("pitchf", pitchf.cpu().numpy().dtype)
-            # print("sid", sid.cpu().numpy().dtype)
-
            audio1 = self.onnx_session.run(
                ["audio"],
                {
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@ -32,7 +32,8 @@ import pyworld as pw
 from voice_changer.RVC.custom_vc_infer_pipeline import VC
 from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
 from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMs768NSFsid
-from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCHLESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_WEBUI_256_PITCHLESS, RVC_MODEL_TYPE_WEBUI_768_PITCHLESS, RVC_MODEL_TYPE_UNKNOWN
+# from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCHLESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_WEBUI_256_PITCHLESS, RVC_MODEL_TYPE_WEBUI_768_PITCHLESS, RVC_MODEL_TYPE_UNKNOWN
+from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI
 from fairseq import checkpoint_utils
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]

@ -44,7 +45,12 @@ class ModelSlot():
    featureFile: str = ""
    indexFile: str = ""
    defaultTrans: int = ""
-    modelType: int = RVC_MODEL_TYPE_UNKNOWN
+    modelType: int = RVC_MODEL_TYPE_RVC
+    samplingRate: int = -1
+    f0: bool = True
+    embChannels: int = 256
+    samplingRateOnnx: int = -1
+    f0Onnx: bool = True


@dataclass
@ -119,8 +125,7 @@ class RVC:
            onnxModelFile=props["files"]["onnxModelFilename"],
            featureFile=props["files"]["featureFilename"],
            indexFile=props["files"]["indexFilename"],
-            defaultTrans=params["trans"],
-            modelType=RVC_MODEL_TYPE_UNKNOWN
+            defaultTrans=params["trans"]
        )

        print("[Voice Changer] RVC loading... slot:", self.tmp_slot)
@ -172,40 +177,53 @@ class RVC:
            # print("config shape:2::::", (cpt).keys)
            config_len = len(cpt["config"])
            upsamplingRateDims = len(cpt["config"][12])
-            if config_len == 18 and cpt["f0"] == 0:
-                print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_PITCHLESS")
-                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_PITCHLESS
-            elif config_len == 18 and cpt["f0"] == 1:
-                print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_NORMAL")
-                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL
-            elif config_len == 19:
-                print("PARAMS:::::::::", cpt["params"])
-                embedding = cpt["config"][17]
-                if embedding == 256 and cpt["f0"] == 0:
-                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_PITCHLESS")
-                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_PITCHLESS
-                elif embedding == 256 and cpt["f0"] == 1:
-                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_NORMAL")
-                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_NORMAL
-                elif embedding == 768 and cpt["f0"] == 0:
-                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_PITCHLESS")
-                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_PITCHLESS
-                else:
-                    print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_NORMAL")
-                    self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_NORMAL
+            if config_len == 18:
+                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC
+                self.settings.modelSlots[slot].embChannels = 256
            else:
-                print("[Voice Changer] RVC Model Type: UNKNOWN")
-                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_UNKNOWN
+                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI
+                self.settings.modelSlots[slot].embChannels = cpt["config"][17]
+            self.settings.modelSlots[slot].f0 = True if cpt["f0"] == 1 else False
+            self.settings.modelSlots[slot].samplingRate = cpt["config"][-1]

            self.settings.modelSamplingRate = cpt["config"][-1]

-            if self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_NORMAL:
+            # if config_len == 18 and cpt["f0"] == 0:
+            #     print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_PITCHLESS")
+            #     self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_PITCHLESS
+            # elif config_len == 18 and cpt["f0"] == 1:
+            #     print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_NORMAL")
+            #     self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL
+            # elif config_len == 19:
+            #     print("PARAMS:::::::::", cpt["params"])
+            #     embedding = cpt["config"][17]
+            #     if embedding == 256 and cpt["f0"] == 0:
+            #         print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_PITCHLESS")
+            #         self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_PITCHLESS
+            #     elif embedding == 256 and cpt["f0"] == 1:
+            #         print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_256_NORMAL")
+            #         self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_256_NORMAL
+            #     elif embedding == 768 and cpt["f0"] == 0:
+            #         print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_PITCHLESS")
+            #         self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_PITCHLESS
+            #     else:
+            #         print("[Voice Changer] RVC Model Type: RVC_MODEL_TYPE_WEBUI_768_NORMAL")
+            #         self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI_768_NORMAL
+            # else:
+            #     print("[Voice Changer] RVC Model Type: UNKNOWN")
+            #     self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_UNKNOWN
+
+            if self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_RVC and self.settings.modelSlots[slot].f0 == True:
                net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
-            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_PITCHLESS:
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_RVC and self.settings.modelSlots[slot].f0 == False:
                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL:
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI and self.settings.modelSlots[slot].f0 == True:
                net_g = SynthesizerTrnMs768NSFsid(**cpt["params"], is_half=self.is_half)
-            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_256_PITCHLESS or self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI_768_PITCHLESS:
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI and self.settings.modelSlots[slot].f0 == False:
+                ######################
+                # TBD
+                ######################
+                print("webui non-f0 is not supported yet")
                net_g = SynthesizerTrnMs768NSFsid(**cpt["params"], is_half=self.is_half)
            else:
                print("unknwon")
@ -221,6 +239,15 @@ class RVC:
        # ONNXモデル生成
        if onnxModelFile != None and onnxModelFile != "":
            self.next_onnx_session = ModelWrapper(onnxModelFile)
+            self.settings.modelSlots[slot].samplingRateOnnx = self.next_onnx_session.getSamplingRate()
+            self.settings.modelSlots[slot].f0Onnx = self.next_onnx_session.getF0()
+            if self.settings.modelSlots[slot].samplingRate == -1:  # ONNXにsampling rateが入っていない
+                self.settings.modelSlots[slot].samplingRate = self.settings.modelSamplingRate
+
+            # ONNXがある場合は、ONNXの設定を優先
+            self.settings.modelSlots[slot].samplingRate = self.settings.modelSlots[slot].samplingRateOnnx
+            self.settings.modelSlots[slot].f0 = self.settings.modelSlots[slot].f0Onnx
+
        else:
            self.next_onnx_session = None

@ -352,8 +379,10 @@ class RVC:
            if_f0 = 1
            f0_file = None

+            f0 = self.settings.modelSlots[self.currentSlot].f0
+            embChannels = self.settings.modelSlots[self.currentSlot].embChannels
            audio_out = vc.pipeline(self.hubert_model, self.onnx_session, sid, audio, times, f0_up_key, f0_method,
-                                    file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
+                                    file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels)
            result = audio_out * np.sqrt(vol)

        return result
@ -397,9 +426,11 @@ class RVC:
            if_f0 = 1
            f0_file = None

-            modelType = self.settings.modelSlots[self.currentSlot].modelType
+            f0 = self.settings.modelSlots[self.currentSlot].f0
+
+            embChannels = self.settings.modelSlots[self.currentSlot].embChannels
            audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
-                                    file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, modelType=modelType)
+                                    file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels)

            result = audio_out * np.sqrt(vol)

@ -454,11 +485,19 @@ class RVC:
        output_file_simple = os.path.splitext(os.path.basename(pyTorchModelFile))[0] + "_simple.onnx"
        output_path = os.path.join(TMP_DIR, output_file)
        output_path_simple = os.path.join(TMP_DIR, output_file_simple)
+        metadata = {
+            "application": "VC_CLIENT",
+            "version": "1",
+            "ModelType": self.settings.modelSlots[self.slot].modelType,
+            "samplingRate": self.settings.modelSlots[self.slot].samplingRate,
+            "f0": self.settings.modelSlots[self.slot].f0,
+            "embChannels": self.settings.modelSlots[self.slot].embChannels,
+        }

        if torch.cuda.device_count() > 0:
-            onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, True)
+            onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, True, metadata)
        else:
            print("[Voice Changer] Warning!!! onnx export with float32. maybe size is doubled.")
-            onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, False)
+            onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, False, metadata)

        return {"status": "ok", "path": f"/tmp/{output_file_simple}", "filename": output_file_simple}
--- a/server/voice_changer/RVC/const.py
+++ b/server/voice_changer/RVC/const.py
@ -1,7 +1,10 @@
-RVC_MODEL_TYPE_NORMAL = 0
-RVC_MODEL_TYPE_PITCHLESS = 1
-RVC_MODEL_TYPE_WEBUI_256_NORMAL = 2
-RVC_MODEL_TYPE_WEBUI_256_PITCHLESS = 3
-RVC_MODEL_TYPE_WEBUI_768_NORMAL = 4
-RVC_MODEL_TYPE_WEBUI_768_PITCHLESS = 5
-RVC_MODEL_TYPE_UNKNOWN = 99
+# RVC_MODEL_TYPE_NORMAL = 0
+# RVC_MODEL_TYPE_PITCHLESS = 1
+# RVC_MODEL_TYPE_WEBUI_256_NORMAL = 2
+# RVC_MODEL_TYPE_WEBUI_256_PITCHLESS = 3
+# RVC_MODEL_TYPE_WEBUI_768_NORMAL = 4
+# RVC_MODEL_TYPE_WEBUI_768_PITCHLESS = 5
+# RVC_MODEL_TYPE_UNKNOWN = 99
+
+# Model family of a loaded checkpoint. RVC.py picks RVC when the checkpoint
+# "config" list has 18 entries and WEBUI otherwise.
+RVC_MODEL_TYPE_RVC = 0
+RVC_MODEL_TYPE_WEBUI = 1
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@ -10,7 +10,8 @@ import pyworld
 import os
 import traceback
 import faiss
-from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCHLESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_WEBUI_256_PITCHLESS, RVC_MODEL_TYPE_WEBUI_768_PITCHLESS
+# from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCHLESS, RVC_MODEL_TYPE_WEBUI_256_NORMAL, RVC_MODEL_TYPE_WEBUI_768_NORMAL, RVC_MODEL_TYPE_WEBUI_256_PITCHLESS, RVC_MODEL_TYPE_WEBUI_768_PITCHLESS
+from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI


 class VC(object):
@ -83,7 +84,7 @@ class VC(object):
        f0_coarse = np.rint(f0_mel).astype(np.int)
        return f0_coarse, f0bak  # 1-0

-    def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, modelType):  # ,file_index,file_big_npy
+    def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, f0=True, embChannels=256):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if (self.is_half == True):
            feats = feats.half()
@ -94,7 +95,7 @@ class VC(object):
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCHLESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_PITCHLESS:
+        if embChannels == 256:
            inputs = {
                "source": feats.to(self.device),
                "padding_mask": padding_mask,
@ -109,11 +110,9 @@ class VC(object):
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
-            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCHLESS or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_PITCHLESS:
-                print("-------------------------256")
+            if embChannels == 256:
                feats = model.final_proj(logits[0])
            else:
-                print("-------------------------768")
                feats = logits[0]

        if (isinstance(index, type(None)) == False and isinstance(big_npy, type(None)) == False and index_rate != 0):
@ -138,10 +137,14 @@ class VC(object):
        p_len = torch.tensor([p_len], device=self.device).long()

        with torch.no_grad():
-            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_768_NORMAL or modelType == RVC_MODEL_TYPE_WEBUI_256_PITCHLESS or modelType == RVC_MODEL_TYPE_WEBUI_768_PITCHLESS:
+            if f0 == True:
                audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
            else:
-                audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+                if hasattr(net_g, "infer_pitchless"):
+                    audio1 = (net_g.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+                else:
+                    audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+
            # audio1 = (net_g.infer(feats, p_len, None, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)

        del feats, p_len, padding_mask
@ -151,7 +154,7 @@ class VC(object):
        times[2] += (t2 - t1)
        return audio1

-    def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, modelType: int = RVC_MODEL_TYPE_NORMAL):
+    def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, f0=True, embChannels=256):
        if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
            try:
                index = faiss.read_index(file_index)
@ -182,10 +185,10 @@ class VC(object):
        times[1] += (t2 - t1)
        if self.t_pad_tgt == 0:
            audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
-                                     pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, modelType))
+                                     pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, f0, embChannels))
        else:
            audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
-                                     pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, modelType)[self.t_pad_tgt:-self.t_pad_tgt])
+                                     pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, f0, embChannels)[self.t_pad_tgt:-self.t_pad_tgt])

        audio_opt = np.concatenate(audio_opt)
        del pitch, pitchf, sid
--- a/server/voice_changer/RVC/export2onnx.py
+++ b/server/voice_changer/RVC/export2onnx.py
@ -1,13 +1,11 @@
-import sys
-import os
-import argparse
 from distutils.util import strtobool
+import json
 import torch
 from torch import nn
 from onnxsim import simplify
 import onnx

-from infer_pack.models import TextEncoder256, GeneratorNSF, PosteriorEncoder, ResidualCouplingBlock
+from infer_pack.models import TextEncoder256, GeneratorNSF, PosteriorEncoder, ResidualCouplingBlock, Generator


 class SynthesizerTrnMs256NSFsid_ONNX(nn.Module):
@ -98,14 +96,105 @@ class SynthesizerTrnMs256NSFsid_ONNX(nn.Module):
        return o, x_mask, (z, z_p, m_p, logs_p)


-def export2onnx(input_model, output_model, output_model_simple, is_half):
+class SynthesizerTrnMs256NSFsid_nono_ONNX(nn.Module):
+    """Pitch-less (f0=False) synthesizer module used by export2onnx as the export target.
+
+    ``forward`` takes phone features, their lengths and a speaker id, and takes
+    no pitch inputs.  ``sr`` and ``**kwargs`` are accepted by the constructor
+    but are not used by this class.
+    """
+
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr=None,
+        **kwargs
+    ):
+
+        super().__init__()
+        # Keep the constructor hyper-parameters on the instance.
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        # Phone encoder, built with f0=False (no pitch input).
+        self.enc_p = TextEncoder256(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout, f0=False
+        )
+        # Decoder: Generator is called in forward() without a pitch argument.
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels
+        )
+        # NOTE(review): enc_q is constructed here but never referenced by forward();
+        # presumably kept so the module layout matches the checkpoint - confirm.
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        # Speaker-id embedding table with spk_embed_dim entries of size gin_channels.
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+    def forward(self, phone, phone_lengths, sid, max_len=None):
+        """Return ``(o, x_mask, (z, z_p, m_p, logs_p))`` for the given phone features and speaker id."""
+        # Speaker embedding with one trailing axis appended.
+        g = self.emb_g(sid).unsqueeze(-1)
+        # The encoder receives None in its second (pitch) position.
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        # m_p plus exp(logs_p)-scaled random noise (factor 0.66666), then masked.
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        # Run the flow with reverse=True, then decode the masked result truncated to max_len.
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+def export2onnx(input_model, output_model, output_model_simple, is_half, metadata):
+
    cpt = torch.load(input_model, map_location="cpu")
    if is_half:
        dev = torch.device("cuda", index=0)
    else:
        dev = torch.device("cpu")

-    net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    if metadata["f0"] == True:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["f0"] == False:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_nono_ONNX(*cpt["config"])
+
    net_g_onnx.eval().to(dev)
    net_g_onnx.load_state_dict(cpt["weight"], strict=False)
    if is_half:
@ -116,22 +205,22 @@ def export2onnx(input_model, output_model, output_model_simple, is_half):
    else:
        feats = torch.FloatTensor(1, 2192, 256).to(dev)
    p_len = torch.LongTensor([2192]).to(dev)
-    pitch = torch.zeros(1, 2192, dtype=torch.int64).to(dev)
-
-    pitchf = torch.FloatTensor(1, 2192).to(dev)
    sid = torch.LongTensor([0]).to(dev)

-    input_names = ["feats", "p_len", "pitch", "pitchf", "sid"]
+    if metadata["f0"] == True:
+        pitch = torch.zeros(1, 2192, dtype=torch.int64).to(dev)
+        pitchf = torch.FloatTensor(1, 2192).to(dev)
+        input_names = ["feats", "p_len", "pitch", "pitchf", "sid"]
+        inputs = (feats, p_len, pitch, pitchf, sid,)
+
+    else:
+        input_names = ["feats", "p_len", "sid"]
+        inputs = (feats, p_len, sid,)
+
    output_names = ["audio", ]

    torch.onnx.export(net_g_onnx,
-                      (
-                          feats,
-                          p_len,
-                          pitch,
-                          pitchf,
-                          sid,
-                      ),
+                      inputs,
                      output_model,
                      dynamic_axes={
                          "feats": [1],
@ -146,4 +235,7 @@ def export2onnx(input_model, output_model, output_model_simple, is_half):

    model_onnx2 = onnx.load(output_model)
    model_simp, check = simplify(model_onnx2)
+    meta = model_simp.metadata_props.add()
+    meta.key = "metadata"
+    meta.value = json.dumps(metadata)
    onnx.save(model_simp, output_model_simple)
--- a/server/voice_changer/RVC/models.py
+++ b/server/voice_changer/RVC/models.py
@ -169,108 +169,3 @@ class SynthesizerTrnMsNSFsid(nn.Module):
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
-
-
-class SynthesizerTrnMs256NSFSidNono(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        gin_channels,
-        emb_channels,
-        sr=None,
-        **kwargs
-    ):
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        self.emb_channels = emb_channels
-        # self.hop_length = hop_length#
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder256(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            emb_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-            f0=False,
-        )
-        self.dec = Generator(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-        )
-        self.enc_q = PosteriorEncoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # 这里ds是id，[bs,1]
-        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
-        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-        z_p = self.flow(z, y_mask, g=g)
-        z_slice, ids_slice = commons.rand_slice_segments(
-            z, y_lengths, self.segment_size
-        )
-        o = self.dec(z_slice, g=g)
-        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-    def infer(self, phone, phone_lengths, sid, max_len=None):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
-        return o, x_mask, (z, z_p, m_p, logs_p)