From 72fb482dc740420b9b7bee17e08781911daf5e52 Mon Sep 17 00:00:00 2001
From: wataru
Date: Tue, 2 May 2023 20:57:12 +0900
Subject: [PATCH] WIP: Japanese Hubert

---
 server/const.py                               |  19 +-
 server/voice_changer/RVC/ModelSlot.py         |   8 +-
 .../voice_changer/RVC/ModelSlotGenerator.py   |  88 +++++
 server/voice_changer/RVC/RVC.py               | 350 ++++++-------------
 server/voice_changer/RVC/RVCSettings.py       |   3 +-
 .../RVC/{ModelWrapper.py => _ModelWrapper.py} |   0
 server/voice_changer/RVC/const.py             |   2 -
 server/voice_changer/RVC/embedder/Embedder.py |  21 +-
 .../RVC/embedder/EmbedderManager.py           |   2 -
 .../RVC/embedder/FairseqContentvec.py         |   2 +-
 .../RVC/embedder/FairseqHubert.py             |   5 +-
 .../RVC/embedder/FairseqHubertJp.py           |   2 +-
 .../RVC/inferencer/Inferencer.py              |  58 +++
 .../RVC/inferencer/InferencerManager.py       |  42 +++
 .../RVC/inferencer/OnnxRVCInferencer.py       |  78 ++++
 .../RVC/inferencer/OnnxRVCInferencerNono.py   |  71 ++++
 .../RVC/inferencer/RVCInferencer.py           |  33 ++
 .../RVC/inferencer/RVCInferencerNono.py       |  33 ++
 .../RVC/inferencer/WebUIInferencer.py         |  31 ++
 .../RVC/inferencer/WebUIInferencerNono.py     |  31 ++
 .../RVC/inferencer/models.py                  | 277 ++++++++++++++
 .../RVC/pipeline/PipelineGenerator.py         |   7 +
 22 files changed, 915 insertions(+), 248 deletions(-)
 create mode 100644 server/voice_changer/RVC/ModelSlotGenerator.py
 rename server/voice_changer/RVC/{ModelWrapper.py => _ModelWrapper.py} (100%)
 delete mode 100644 server/voice_changer/RVC/const.py
 create mode 100644 server/voice_changer/RVC/inferencer/Inferencer.py
 create mode 100644 server/voice_changer/RVC/inferencer/InferencerManager.py
 create mode 100644 server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
 create mode 100644 server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
 create mode 100644 server/voice_changer/RVC/inferencer/RVCInferencer.py
 create mode 100644 server/voice_changer/RVC/inferencer/RVCInferencerNono.py
 create mode 100644 server/voice_changer/RVC/inferencer/WebUIInferencer.py
 create mode 100644 server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
 create mode 100644 server/voice_changer/RVC/inferencer/models.py
 create mode 100644 server/voice_changer/RVC/pipeline/PipelineGenerator.py

diff --git a/server/const.py b/server/const.py
index 1f7cbfe5..6ba67521 100644
--- a/server/const.py
+++ b/server/const.py
@@ -66,7 +66,22 @@ def getFrontendPath():
     return frontend_path


+# "hubert_base", "contentvec", "distilhubert"
 class EnumEmbedderTypes(Enum):
-    hubert = "hubert"
+    hubert = "hubert_base"
     contentvec = "contentvec"
-    hubert_jp = "hubert_jp"
+    hubert_jp = "hubert-base-japanese"
+
+
+class EnumInferenceTypes(Enum):
+    pyTorchRVC = "pyTorchRVC"
+    pyTorchRVCNono = "pyTorchRVCNono"
+    pyTorchWebUI = "pyTorchWebUI"
+    pyTorchWebUINono = "pyTorchWebUINono"
+    onnxRVC = "onnxRVC"
+    onnxRVCNono = "onnxRVCNono"
+
+
+class EnumFrameworkTypes(Enum):
+    pyTorch = "pyTorch"
+    onnx = "onnx"
diff --git a/server/voice_changer/RVC/ModelSlot.py b/server/voice_changer/RVC/ModelSlot.py
index 14619910..d02c788f 100644
--- a/server/voice_changer/RVC/ModelSlot.py
+++ b/server/voice_changer/RVC/ModelSlot.py
@@ -1,5 +1,6 @@
+from const import EnumInferenceTypes, EnumEmbedderTypes
+
 from dataclasses import dataclass
-from voice_changer.RVC.const import RVC_MODEL_TYPE_RVC


 @dataclass
@@ -9,9 +10,10 @@ class ModelSlot:
     featureFile: str = ""
     indexFile: str = ""
     defaultTrans: int = 0
-    modelType: int = RVC_MODEL_TYPE_RVC
+    isONNX: bool = False
+    modelType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
     samplingRate: int = -1
     f0: bool = True
     embChannels: int = 256
     deprecated: bool = False
-    embedder: str = "hubert_base"  # "hubert_base", "contentvec", "distilhubert"
+    embedder: EnumEmbedderTypes = EnumEmbedderTypes.hubert
diff --git a/server/voice_changer/RVC/ModelSlotGenerator.py b/server/voice_changer/RVC/ModelSlotGenerator.py
new file mode 100644
index 00000000..036aa0f6
--- /dev/null
+++ b/server/voice_changer/RVC/ModelSlotGenerator.py
@@ -0,0 +1,88 @@
+from const import EnumEmbedderTypes, EnumInferenceTypes
+from voice_changer.RVC.ModelSlot import ModelSlot
+
+from voice_changer.utils.LoadModelParams import FilePaths
+import torch
+import onnxruntime
+import json
+
+
+def generateModelSlot(files: FilePaths, params):
+    modelSlot = ModelSlot()
+    modelSlot.pyTorchModelFile = files.pyTorchModelFilename
+    modelSlot.onnxModelFile = files.onnxModelFilename
+    modelSlot.featureFile = files.featureFilename
+    modelSlot.indexFile = files.indexFilename
+    modelSlot.defaultTrans = params["trans"] if "trans" in params else 0
+
+    modelSlot.isONNX = True if modelSlot.onnxModelFile is not None else False
+
+    if modelSlot.isONNX:
+        _setInfoByONNX(modelSlot, modelSlot.onnxModelFile)
+    else:
+        _setInfoByPytorch(modelSlot, modelSlot.pyTorchModelFile)
+    return modelSlot
+
+
+def _setInfoByPytorch(slot: ModelSlot, file: str):
+    cpt = torch.load(file, map_location="cpu")
+    config_len = len(cpt["config"])
+    if config_len == 18:
+        slot.f0 = True if cpt["f0"] == 1 else False
+        slot.modelType = (
+            EnumInferenceTypes.pyTorchRVC
+            if slot.f0
+            else EnumInferenceTypes.pyTorchRVCNono
+        )
+        slot.embChannels = 256
+        slot.embedder = EnumEmbedderTypes.hubert
+    else:
+        slot.f0 = True if cpt["f0"] == 1 else False
+        slot.modelType = (
+            EnumInferenceTypes.pyTorchWebUI
+            if slot.f0
+            else EnumInferenceTypes.pyTorchWebUINono
+        )
+        slot.embChannels = cpt["config"][17]
+        slot.embedder = cpt["embedder_name"]
+        if slot.embedder.endswith("768"):
+            slot.embedder = slot.embedder[:-3]
+
+    slot.samplingRate = cpt["config"][-1]
+
+    del cpt
+
+
+def _setInfoByONNX(slot: ModelSlot, file: str):
+    tmp_onnx_session = onnxruntime.InferenceSession(
+        file, providers=["CPUExecutionProvider"]
+    )
+    modelmeta = tmp_onnx_session.get_modelmeta()
+    try:
+        metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
+
+        slot.modelType = metadata["modelType"]
+        slot.embChannels = metadata["embChannels"]
+        slot.embedder = (
+            metadata["embedder"] if "embedder" in metadata else EnumEmbedderTypes.hubert
+        )
+        slot.f0 = metadata["f0"]
+        slot.modelType = (
+            EnumInferenceTypes.onnxRVC if slot.f0 else EnumInferenceTypes.onnxRVCNono
+        )
+        slot.samplingRate = metadata["samplingRate"]
+        slot.deprecated = False
+
+    except Exception:
+        slot.modelType = EnumInferenceTypes.onnxRVC
+        slot.embChannels = 256
+        slot.embedder = EnumEmbedderTypes.hubert
+        slot.f0 = True
+        slot.samplingRate = 48000
+        slot.deprecated = True
+
+        print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
+        print("[Voice Changer] This onnx file is deprecated. Please regenerate the onnx file.")
+        print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
+
+    del tmp_onnx_session
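Usage sketch (illustrative, not part of the patch): generateModelSlot is driven from
RVC.loadModel below. The file names here are hypothetical placeholders, and FilePaths
is assumed to be a plain dataclass over exactly these four fields.

    from voice_changer.utils.LoadModelParams import FilePaths
    from voice_changer.RVC.ModelSlotGenerator import generateModelSlot

    files = FilePaths(
        pyTorchModelFilename="model.pth",   # hypothetical checkpoint path
        onnxModelFilename=None,             # None -> the PyTorch branch is taken
        featureFilename="features.npy",     # hypothetical feature file
        indexFilename="added.index",        # hypothetical faiss index
    )
    slot = generateModelSlot(files, {"trans": 12})
    print(slot.modelType, slot.embChannels, slot.samplingRate)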
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 7bde7b7a..7f8816c7 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -1,29 +1,5 @@
 import sys
 import os
-import json
-import resampy
-from voice_changer.RVC.MergeModel import merge_model
-from voice_changer.RVC.MergeModelRequest import MergeModelRequest
-from voice_changer.RVC.ModelWrapper import ModelWrapper
-from Exceptions import NoModeLoadedException
-from voice_changer.RVC.RVCSettings import RVCSettings
-from voice_changer.RVC.embedder.Embedder import Embedder
-from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
-from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
-from voice_changer.utils.VoiceChangerModel import AudioInOut
-from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
-
-from dataclasses import asdict
-from typing import cast
-import numpy as np
-import torch
-
-# from fairseq import checkpoint_utils
-import traceback
-import faiss
-
-from const import TMP_DIR, UPLOAD_DIR  # type:ignore
-
 # avoiding parse arg error in RVC
 sys.argv = ["MMVCServerSIO.py"]

@@ -37,16 +13,35 @@ if sys.platform.startswith("darwin"):
         sys.path.append(modulePath)
 else:
     sys.path.append("RVC")
+import json
+import resampy
+from voice_changer.RVC.MergeModel import merge_model
+from voice_changer.RVC.MergeModelRequest import MergeModelRequest
+from voice_changer.RVC.ModelSlotGenerator import generateModelSlot
+from Exceptions import NoModeLoadedException
+from voice_changer.RVC.RVCSettings import RVCSettings
+from voice_changer.RVC.embedder.Embedder import Embedder
+from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
+from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
+from voice_changer.utils.VoiceChangerModel import AudioInOut
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+
+from dataclasses import asdict
+from typing import cast
+import numpy as np
+import torch
+
+
+# from fairseq import checkpoint_utils
+import traceback
+import faiss
+
+from const import TMP_DIR, UPLOAD_DIR

-from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMsNSFsid_webui
-from .models import SynthesizerTrnMsNSFsidNono as SynthesizerTrnMsNSFsidNono_webui
-from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI
 from voice_changer.RVC.custom_vc_infer_pipeline import VC
-from infer_pack.models import (  # type:ignore
-    SynthesizerTrnMs256NSFsid,
-    SynthesizerTrnMs256NSFsid_nono,
-)

 providers = [
     "OpenVINOExecutionProvider",
@@ -59,13 +54,12 @@ providers = [
 class RVC:
     audio_buffer: AudioInOut | None = None
     embedder: Embedder | None = None
+    inferencer: Inferencer | None = None

     def __init__(self, params: VoiceChangerParams):
         self.initialLoad = True
         self.settings = RVCSettings()

-        self.net_g = None
-        self.onnx_session = None
         self.feature_file = None
         self.index_file = None

@@ -83,173 +77,66 @@ class RVC:

     def loadModel(self, props: LoadModelParams):
         """
-        loadModel only registers the model into a slot (no load for inference).
-        As an exception, it does load when nothing has been loaded for inference yet.
+        loadModel only registers the model into a slot (no load for inference).
+        As an exception, it does load when nothing has been loaded for inference
+        yet, or when the target slot is the one currently in use.
         """
         self.is_half = props.isHalf
-        tmp_slot = props.slot
+        target_slot_idx = props.slot

         params_str = props.params
         params = json.loads(params_str)

-        self.settings.modelSlots[
-            tmp_slot
-        ].pyTorchModelFile = props.files.pyTorchModelFilename
-        self.settings.modelSlots[tmp_slot].onnxModelFile = props.files.onnxModelFilename
-        self.settings.modelSlots[tmp_slot].featureFile = props.files.featureFilename
-        self.settings.modelSlots[tmp_slot].indexFile = props.files.indexFilename
-        self.settings.modelSlots[tmp_slot].defaultTrans = (
-            params["trans"] if "trans" in params else 0
-        )
-
-        isONNX = (
-            True
-            if self.settings.modelSlots[tmp_slot].onnxModelFile is not None
-            else False
-        )
-
-        # set the metadata
-        if isONNX:
-            self._setInfoByONNX(
-                tmp_slot, self.settings.modelSlots[tmp_slot].onnxModelFile
-            )
-        else:
-            self._setInfoByPytorch(
-                tmp_slot, self.settings.modelSlots[tmp_slot].pyTorchModelFile
-            )
-
+        modelSlot = generateModelSlot(props.files, params)
+        self.settings.modelSlots[target_slot_idx] = modelSlot
         print(
-            f"[Voice Changer] RVC loading... slot:{tmp_slot}",
-            asdict(self.settings.modelSlots[tmp_slot]),
+            f"[Voice Changer] RVC new model uploaded. slot: {target_slot_idx}",
+            asdict(modelSlot),
         )

-        # load hubert
-        # try:
-        #     hubert_path = self.params.hubert_base
-        #     hubert_path_jp = self.params.hubert_base_jp
-        #     print(hubert_path, hubert_path_jp)
-
-        #     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-        #         [hubert_path],
-        #         suffix="",
-        #     )
-        #     model = models[0]
-        #     model.eval()
-        #     if self.is_half:
-        #         model = model.half()
-        #     self.hubert_model = model
-
-        # except Exception as e:
-        #     print("EXCEPTION during loading hubert/contentvec model", e)
-        #     print("    hubert_path:", hubert_path)

         # load for inference only on the first call (or for the active slot)
-        if self.initialLoad or tmp_slot == self.currentSlot:
-            self.prepareModel(tmp_slot)
-            self.settings.modelSlotIndex = tmp_slot
-            self.currentSlot = self.settings.modelSlotIndex
+        if self.initialLoad or target_slot_idx == self.currentSlot:
+            self.prepareModel(target_slot_idx)
+            self.settings.modelSlotIndex = target_slot_idx
+            # self.currentSlot = self.settings.modelSlotIndex
             self.switchModel()
            self.initialLoad = False

         return self.get_info()

-    def _setInfoByPytorch(self, slot, file):
-        cpt = torch.load(file, map_location="cpu")
-        config_len = len(cpt["config"])
-        if config_len == 18:
-            self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC
-            self.settings.modelSlots[slot].embChannels = 256
-            self.settings.modelSlots[slot].embedder = "hubert_base"
-        else:
-            self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI
-            self.settings.modelSlots[slot].embChannels = cpt["config"][17]
-            self.settings.modelSlots[slot].embedder = cpt["embedder_name"]
-            if self.settings.modelSlots[slot].embedder.endswith("768"):
-                self.settings.modelSlots[slot].embedder = self.settings.modelSlots[
-                    slot
-                ].embedder[:-3]
-
-        self.settings.modelSlots[slot].f0 = True if cpt["f0"] == 1 else False
-        self.settings.modelSlots[slot].samplingRate = cpt["config"][-1]
-
-        # self.settings.modelSamplingRate = cpt["config"][-1]
-
-    def _setInfoByONNX(self, slot, file):
-        tmp_onnx_session = ModelWrapper(file)
-        self.settings.modelSlots[slot].modelType = tmp_onnx_session.getModelType()
-        self.settings.modelSlots[slot].embChannels = tmp_onnx_session.getEmbChannels()
-        self.settings.modelSlots[slot].embedder = tmp_onnx_session.getEmbedder()
-        self.settings.modelSlots[slot].f0 = tmp_onnx_session.getF0()
-        self.settings.modelSlots[slot].samplingRate = tmp_onnx_session.getSamplingRate()
-        self.settings.modelSlots[slot].deprecated = tmp_onnx_session.getDeprecated()
-
     def prepareModel(self, slot: int):
         if slot < 0:
             return self.get_info()
         print("[Voice Changer] Prepare Model of slot:", slot)
-        onnxModelFile = self.settings.modelSlots[slot].onnxModelFile
-
-        isONNX = (
-            True if self.settings.modelSlots[slot].onnxModelFile is not None else False
+        modelSlot = self.settings.modelSlots[slot]
+        filename = (
+            modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile
         )

-        # load the model
-        if isONNX:
-            print("[Voice Changer] Loading ONNX Model...")
-            self.next_onnx_session = ModelWrapper(onnxModelFile)
-            self.next_net_g = None
+        if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
+            dev = torch.device("cpu")
+        elif self.mps_enabled:
+            dev = torch.device("mps")
         else:
-            print("[Voice Changer] Loading Pytorch Model...")
-            torchModelSlot = self.settings.modelSlots[slot]
-            cpt = torch.load(torchModelSlot.pyTorchModelFile, map_location="cpu")
+            dev = torch.device("cuda", index=self.settings.gpu)

-            if (
-                torchModelSlot.modelType == RVC_MODEL_TYPE_RVC
-                and torchModelSlot.f0 is True
-            ):
-                net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
-            elif (
-                torchModelSlot.modelType == RVC_MODEL_TYPE_RVC
-                and torchModelSlot.f0 is False
-            ):
-                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-            elif (
-                torchModelSlot.modelType == RVC_MODEL_TYPE_WEBUI
-                and torchModelSlot.f0 is True
-            ):
-                net_g = SynthesizerTrnMsNSFsid_webui(
-                    **cpt["params"], is_half=self.is_half
-                )
-            else:
-                net_g = SynthesizerTrnMsNSFsidNono_webui(
-                    **cpt["params"], is_half=self.is_half
-                )
-            net_g.eval()
-            net_g.load_state_dict(cpt["weight"], strict=False)
-
-            if self.is_half:
-                net_g = net_g.half()
-
-            self.next_net_g = net_g
-            self.next_onnx_session = None
+        # load the inferencer
+        inferencer = InferencerManager.getInferencer(
+            modelSlot.modelType,
+            filename,
+            self.settings.isHalf,
+            dev,
+        )
+        self.next_inferencer = inferencer

         # load the index
         print("[Voice Changer] Loading index...")
-        self.next_feature_file = self.settings.modelSlots[slot].featureFile
-        self.next_index_file = self.settings.modelSlots[slot].indexFile
-
-        if (
-            self.settings.modelSlots[slot].featureFile is not None
-            and self.settings.modelSlots[slot].indexFile is not None
-        ):
+        if modelSlot.featureFile is not None and modelSlot.indexFile is not None:
             if (
-                os.path.exists(self.settings.modelSlots[slot].featureFile) is True
-                and os.path.exists(self.settings.modelSlots[slot].indexFile) is True
+                os.path.exists(modelSlot.featureFile) is True
+                and os.path.exists(modelSlot.indexFile) is True
             ):
                 try:
-                    self.next_index = faiss.read_index(
-                        self.settings.modelSlots[slot].indexFile
-                    )
-                    self.next_feature = np.load(
-                        self.settings.modelSlots[slot].featureFile
-                    )
+                    self.next_index = faiss.read_index(modelSlot.indexFile)
+                    self.next_feature = np.load(modelSlot.featureFile)
                 except:
Use no index.") traceback.print_exc() @@ -260,12 +147,10 @@ class RVC: else: self.next_index = self.next_feature = None - self.next_trans = self.settings.modelSlots[slot].defaultTrans - self.next_samplingRate = self.settings.modelSlots[slot].samplingRate - self.next_embedder = self.settings.modelSlots[slot].embedder - self.next_framework = ( - "ONNX" if self.next_onnx_session is not None else "PyTorch" - ) + self.next_trans = modelSlot.defaultTrans + self.next_samplingRate = modelSlot.samplingRate + self.next_embedder = modelSlot.embedder + self.next_framework = "ONNX" if modelSlot.isONNX else "PyTorch" print("[Voice Changer] Prepare done.") return self.get_info() @@ -284,15 +169,13 @@ class RVC: print("[Voice Changer] load hubert error", e) traceback.print_exc() - self.net_g = self.next_net_g - self.onnx_session = self.next_onnx_session - self.feature_file = self.next_feature_file - self.index_file = self.next_index_file + self.inferencer = self.next_inferencer self.feature = self.next_feature self.index = self.next_index self.settings.tran = self.next_trans self.settings.framework = self.next_framework self.settings.modelSamplingRate = self.next_samplingRate + self.next_net_g = None self.next_onnx_session = None print( @@ -300,41 +183,41 @@ class RVC: ) def update_settings(self, key: str, val: int | float | str): - if key == "onnxExecutionProvider" and self.onnx_session is not None: - if val == "CUDAExecutionProvider": - if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num: - self.settings.gpu = 0 - provider_options = [{"device_id": self.settings.gpu}] - self.onnx_session.set_providers( - providers=[val], provider_options=provider_options - ) - if hasattr(self, "hubert_onnx"): - self.hubert_onnx.set_providers( - providers=[val], provider_options=provider_options - ) - else: - self.onnx_session.set_providers(providers=[val]) - if hasattr(self, "hubert_onnx"): - self.hubert_onnx.set_providers(providers=[val]) - elif key == "onnxExecutionProvider" and self.onnx_session is None: - print("Onnx is not enabled. Please load model.") - return False - elif key in self.settings.intData: + # if key == "onnxExecutionProvider" and self.onnx_session is not None: + # if val == "CUDAExecutionProvider": + # if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num: + # self.settings.gpu = 0 + # provider_options = [{"device_id": self.settings.gpu}] + # self.onnx_session.set_providers( + # providers=[val], provider_options=provider_options + # ) + # if hasattr(self, "hubert_onnx"): + # self.hubert_onnx.set_providers( + # providers=[val], provider_options=provider_options + # ) + # else: + # self.onnx_session.set_providers(providers=[val]) + # if hasattr(self, "hubert_onnx"): + # self.hubert_onnx.set_providers(providers=[val]) + # elif key == "onnxExecutionProvider" and self.onnx_session is None: + # print("Onnx is not enabled. 
Please load model.") + # return False + if key in self.settings.intData: val = cast(int, val) - if ( - key == "gpu" - and val >= 0 - and val < self.gpu_num - and self.onnx_session is not None - ): - providers = self.onnx_session.get_providers() - print("Providers:", providers) - if "CUDAExecutionProvider" in providers: - provider_options = [{"device_id": self.settings.gpu}] - self.onnx_session.set_providers( - providers=["CUDAExecutionProvider"], - provider_options=provider_options, - ) + # if ( + # key == "gpu" + # and val >= 0 + # and val < self.gpu_num + # and self.onnx_session is not None + # ): + # providers = self.onnx_session.get_providers() + # print("Providers:", providers) + # if "CUDAExecutionProvider" in providers: + # provider_options = [{"device_id": self.settings.gpu}] + # self.onnx_session.set_providers( + # providers=["CUDAExecutionProvider"], + # provider_options=provider_options, + # ) if key == "modelSlotIndex": if int(val) < 0: return True @@ -355,9 +238,9 @@ class RVC: def get_info(self): data = asdict(self.settings) - data["onnxExecutionProviders"] = ( - self.onnx_session.get_providers() if self.onnx_session is not None else [] - ) + # data["onnxExecutionProviders"] = ( + # self.onnx_session.get_providers() if self.onnx_session is not None else [] + # ) files = ["configFile", "pyTorchModelFile", "onnxModelFile"] for f in files: if data[f] is not None and os.path.exists(data[f]): @@ -430,7 +313,12 @@ class RVC: with torch.no_grad(): repeat = 3 if self.is_half else 1 repeat *= self.settings.rvcQuality # 0 or 3 - vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat) + vc = VC( + self.settings.modelSamplingRate, + torch.device("cuda:0"), + self.is_half, + repeat, + ) sid = 0 f0_up_key = self.settings.tran f0_method = self.settings.f0Detector @@ -459,13 +347,13 @@ class RVC: return result def _pyTorch_inference(self, data): - if hasattr(self, "net_g") is False or self.net_g is None: - print( - "[Voice Changer] No pyTorch session.", - hasattr(self, "net_g"), - self.net_g, - ) - raise NoModeLoadedException("pytorch") + # if hasattr(self, "net_g") is False or self.net_g is None: + # print( + # "[Voice Changer] No pyTorch session.", + # hasattr(self, "net_g"), + # self.net_g, + # ) + # raise NoModeLoadedException("pytorch") if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False): dev = torch.device("cpu") @@ -475,7 +363,10 @@ class RVC: dev = torch.device("cuda", index=self.settings.gpu) self.embedder = self.embedder.to(dev) - self.net_g = self.net_g.to(dev) + self.inferencer = self.inferencer.to(dev) + + # self.embedder.printDevice() + # self.inferencer.printDevice() audio = data[0] convertSize = data[1] @@ -498,9 +389,8 @@ class RVC: embChannels = self.settings.modelSlots[self.currentSlot].embChannels audio_out = vc.pipeline( - # self.hubert_model, self.embedder, - self.net_g, + self.inferencer, sid, audio, f0_up_key, diff --git a/server/voice_changer/RVC/RVCSettings.py b/server/voice_changer/RVC/RVCSettings.py index 8bfa50a5..2bc269c7 100644 --- a/server/voice_changer/RVC/RVCSettings.py +++ b/server/voice_changer/RVC/RVCSettings.py @@ -28,7 +28,7 @@ class RVCSettings: modelSlotIndex: int = -1 speakers: dict[str, int] = field(default_factory=lambda: {}) - + isHalf: int = 1 # 0:off, 1:on # ↓mutableな物だけ列挙 intData = [ "gpu", @@ -39,6 +39,7 @@ class RVCSettings: "modelSamplingRate", "silenceFront", "modelSlotIndex", + "isHalf", ] floatData = ["silentThreshold", "indexRatio"] strData = ["framework", "f0Detector"] diff --git 
diff --git a/server/voice_changer/RVC/ModelWrapper.py b/server/voice_changer/RVC/_ModelWrapper.py
similarity index 100%
rename from server/voice_changer/RVC/ModelWrapper.py
rename to server/voice_changer/RVC/_ModelWrapper.py
diff --git a/server/voice_changer/RVC/const.py b/server/voice_changer/RVC/const.py
deleted file mode 100644
index 205a9ef3..00000000
--- a/server/voice_changer/RVC/const.py
+++ /dev/null
@@ -1,2 +0,0 @@
-RVC_MODEL_TYPE_RVC = 0
-RVC_MODEL_TYPE_WEBUI = 1
diff --git a/server/voice_changer/RVC/embedder/Embedder.py b/server/voice_changer/RVC/embedder/Embedder.py
index 894e939e..bce0dd42 100644
--- a/server/voice_changer/RVC/embedder/Embedder.py
+++ b/server/voice_changer/RVC/embedder/Embedder.py
@@ -15,14 +15,24 @@ class Embedder(Protocol):
     model: Any | None = None

     def loadModel(self, file: str, dev: device, isHalf: bool = True):
-        self.embedderType = EnumEmbedderTypes.hubert
-        self.file = file
-        self.isHalf = isHalf
-        self.dev = dev
+        ...

     def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
         ...

+    def setProps(
+        self,
+        embedderType: EnumEmbedderTypes,
+        file: str,
+        dev: device,
+        isHalf: bool = True,
+    ):
+        self.embedderType = embedderType
+        self.file = file
+        self.isHalf = isHalf
+        self.dev = dev
+        print("[Voice Changer] embedder initialized. device:", self.dev)
+
     def setHalf(self, isHalf: bool):
         self.isHalf = isHalf
         if self.model is not None and isHalf:
@@ -59,3 +69,6 @@
         if self.model is not None:
             self.model = self.model.to(dev)
         return self
+
+    def printDevice(self):
+        print("embedder device:", self.model.device)
diff --git a/server/voice_changer/RVC/embedder/EmbedderManager.py b/server/voice_changer/RVC/embedder/EmbedderManager.py
index be3b8ae0..a85b067d 100644
--- a/server/voice_changer/RVC/embedder/EmbedderManager.py
+++ b/server/voice_changer/RVC/embedder/EmbedderManager.py
@@ -16,7 +16,6 @@ class EmbedderManager:
     ) -> Embedder:
         if cls.currentEmbedder is None:
             print("[Voice Changer] generate new embedder. (no embedder)")
-            cls.loadEmbedder(embederType, file, isHalf, dev)
             cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
         elif cls.currentEmbedder.matchCondition(embederType, file) is False:
             print("[Voice Changer] generate new embedder. (not match)")
@@ -24,7 +23,6 @@ class EmbedderManager:
         else:
             cls.currentEmbedder.setDevice(dev)
             cls.currentEmbedder.setHalf(isHalf)
-        print("RETURN", cls.currentEmbedder)
         return cls.currentEmbedder

     @classmethod
diff --git a/server/voice_changer/RVC/embedder/FairseqContentvec.py b/server/voice_changer/RVC/embedder/FairseqContentvec.py
index 4463575e..dffd1126 100644
--- a/server/voice_changer/RVC/embedder/FairseqContentvec.py
+++ b/server/voice_changer/RVC/embedder/FairseqContentvec.py
@@ -7,5 +7,5 @@ from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
 class FairseqContentvec(FairseqHubert):
     def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
         super().loadModel(file, dev, isHalf)
-        self.embedderType = EnumEmbedderTypes.contentvec
+        super().setProps(EnumEmbedderTypes.contentvec, file, dev, isHalf)
         return self
diff --git a/server/voice_changer/RVC/embedder/FairseqHubert.py b/server/voice_changer/RVC/embedder/FairseqHubert.py
index edd9ac23..ea4fd3ff 100644
--- a/server/voice_changer/RVC/embedder/FairseqHubert.py
+++ b/server/voice_changer/RVC/embedder/FairseqHubert.py
@@ -7,7 +7,7 @@ from fairseq import checkpoint_utils

 class FairseqHubert(Embedder):
     def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
-        super().loadModel(file, dev, isHalf)
+        super().setProps(EnumEmbedderTypes.hubert, file, dev, isHalf)

         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
             [file],
@@ -21,7 +21,6 @@ class FairseqHubert(Embedder):
             model = model.half()

         self.model = model
-        self.embedderType = EnumEmbedderTypes.hubert
         return self

     def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
@@ -38,6 +37,8 @@ class FairseqHubert(Embedder):
             "padding_mask": padding_mask,
         }

+        print("feat dev", self.dev)
+
         with torch.no_grad():
             logits = self.model.extract_features(**inputs)
             if embChannels == 256:
diff --git a/server/voice_changer/RVC/embedder/FairseqHubertJp.py b/server/voice_changer/RVC/embedder/FairseqHubertJp.py
index e3d1df13..b9172206 100644
--- a/server/voice_changer/RVC/embedder/FairseqHubertJp.py
+++ b/server/voice_changer/RVC/embedder/FairseqHubertJp.py
@@ -7,5 +7,5 @@ from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
 class FairseqHubertJp(FairseqHubert):
     def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
         super().loadModel(file, dev, isHalf)
-        self.embedderType = EnumEmbedderTypes.hubert_jp
+        super().setProps(EnumEmbedderTypes.hubert_jp, file, dev, isHalf)
         return self
diff --git a/server/voice_changer/RVC/inferencer/Inferencer.py b/server/voice_changer/RVC/inferencer/Inferencer.py
new file mode 100644
index 00000000..960f98c2
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/Inferencer.py
@@ -0,0 +1,58 @@
+from typing import Any, Protocol
+
+import torch
+from torch import device
+
+from const import EnumInferenceTypes
+
+
+class Inferencer(Protocol):
+    inferencerType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
+    file: str
+    isHalf: bool = True
+    dev: device
+
+    model: Any | None = None
+
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        ...
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        ...
+
+    def setProps(
+        self,
+        inferencerType: EnumInferenceTypes,
+        file: str,
+        dev: device,
+        isHalf: bool = True,
+    ):
+        self.inferencerType = inferencerType
+        self.file = file
+        self.isHalf = isHalf
+        self.dev = dev
+
+    def setHalf(self, isHalf: bool):
+        self.isHalf = isHalf
+        if self.model is not None and isHalf:
+            self.model = self.model.half()
+
+    def setDevice(self, dev: device):
+        self.dev = dev
+        if self.model is not None:
+            self.model = self.model.to(self.dev)
+
+    def to(self, dev: torch.device):
+        if self.model is not None:
+            self.model = self.model.to(dev)
+        return self
+
+    def printDevice(self):
+        print("inferencer device:", self.model.device)
diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py b/server/voice_changer/RVC/inferencer/InferencerManager.py
new file mode 100644
index 00000000..7f2203cc
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/InferencerManager.py
@@ -0,0 +1,40 @@
+from torch import device
+
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
+from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
+from voice_changer.RVC.inferencer.RVCInferencer import RVCInferencer
+from voice_changer.RVC.inferencer.RVCInferencerNono import RVCInferencerNono
+from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer
+from voice_changer.RVC.inferencer.WebUIInferencerNono import WebUIInferencerNono
+
+
+class InferencerManager:
+    currentInferencer: Inferencer | None = None
+
+    @classmethod
+    def getInferencer(
+        cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
+    ) -> Inferencer:
+        cls.currentInferencer = cls.loadInferencer(inferencerType, file, isHalf, dev)
+        return cls.currentInferencer
+
+    @classmethod
+    def loadInferencer(
+        cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
+    ) -> Inferencer:
+        if inferencerType == EnumInferenceTypes.pyTorchRVC:
+            return RVCInferencer().loadModel(file, dev, isHalf)
+        elif inferencerType == EnumInferenceTypes.pyTorchRVCNono:
+            return RVCInferencerNono().loadModel(file, dev, isHalf)
+        elif inferencerType == EnumInferenceTypes.pyTorchWebUI:
+            return WebUIInferencer().loadModel(file, dev, isHalf)
+        elif inferencerType == EnumInferenceTypes.pyTorchWebUINono:
+            return WebUIInferencerNono().loadModel(file, dev, isHalf)
+        elif inferencerType == EnumInferenceTypes.onnxRVC:
+            return OnnxRVCInferencer().loadModel(file, dev, isHalf)
+        elif inferencerType == EnumInferenceTypes.onnxRVCNono:
+            return OnnxRVCInferencerNono().loadModel(file, dev, isHalf)
+        else:
+            raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
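Usage sketch (illustrative): resolving an inferencer for a slot through the manager.
The checkpoint path is a hypothetical placeholder; the argument order is (type, file,
isHalf, dev), as defined above.

    import torch
    from const import EnumInferenceTypes
    from voice_changer.RVC.inferencer.InferencerManager import InferencerManager

    inferencer = InferencerManager.getInferencer(
        EnumInferenceTypes.pyTorchRVC,
        "model.pth",          # hypothetical checkpoint
        False,                # isHalf
        torch.device("cpu"),
    )
    # audio = inferencer.infer(feats, pitch_length, pitch, pitchf, sid)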
diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
new file mode 100644
index 00000000..eb9df727
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
@@ -0,0 +1,78 @@
+import torch
+from torch import device
+import onnxruntime
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+import numpy as np
+
+providers = ["CPUExecutionProvider"]
+
+
+class OnnxRVCInferencer(Inferencer):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        super().setProps(EnumInferenceTypes.onnxRVC, file, dev, isHalf)
+        # ort_options = onnxruntime.SessionOptions()
+        # ort_options.intra_op_num_threads = 8
+
+        onnx_session = onnxruntime.InferenceSession(
+            file, providers=providers
+        )
+
+        # check half-precision
+        first_input_type = onnx_session.get_inputs()[0].type
+        if first_input_type == "tensor(float)":
+            self.isHalf = False
+        else:
+            self.isHalf = True
+
+        self.model = onnx_session
+        return self
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        if pitch is None or pitchf is None:
+            raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.")
+
+        if self.isHalf:
+            audio1 = self.model.run(
+                ["audio"],
+                {
+                    "feats": feats.cpu().numpy().astype(np.float16),
+                    "p_len": pitch_length.cpu().numpy().astype(np.int64),
+                    "pitch": pitch.cpu().numpy().astype(np.int64),
+                    "pitchf": pitchf.cpu().numpy().astype(np.float32),
+                    "sid": sid.cpu().numpy().astype(np.int64),
+                },
+            )
+        else:
+            audio1 = self.model.run(
+                ["audio"],
+                {
+                    "feats": feats.cpu().numpy().astype(np.float32),
+                    "p_len": pitch_length.cpu().numpy().astype(np.int64),
+                    "pitch": pitch.cpu().numpy().astype(np.int64),
+                    "pitchf": pitchf.cpu().numpy().astype(np.float32),
+                    "sid": sid.cpu().numpy().astype(np.int64),
+                },
+            )
+
+        return torch.tensor(np.array(audio1))
+
+    def setHalf(self, isHalf: bool):
+        raise RuntimeError("half-precision is not changeable.", self.isHalf)
+
+    def setDevice(self, dev: device):
+        self.dev = dev
+        # onnxruntime sessions have no .to(); the device is fixed by the
+        # execution provider chosen at session creation.
+
+    def to(self, dev: torch.device):
+        # no-op for ONNX: the session stays on the provider it was
+        # created with (see setDevice above).
+        return self
diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
new file mode 100644
index 00000000..67a673c4
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
@@ -0,0 +1,71 @@
+import torch
+from torch import device
+import onnxruntime
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+import numpy as np
+
+providers = ["CPUExecutionProvider"]
+
+
+class OnnxRVCInferencerNono(Inferencer):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        super().setProps(EnumInferenceTypes.onnxRVCNono, file, dev, isHalf)
+        # ort_options = onnxruntime.SessionOptions()
+        # ort_options.intra_op_num_threads = 8
+
+        onnx_session = onnxruntime.InferenceSession(
+            file, providers=providers
+        )
+
+        # check half-precision
+        first_input_type = onnx_session.get_inputs()[0].type
+        if first_input_type == "tensor(float)":
+            self.isHalf = False
+        else:
+            self.isHalf = True
+
+        self.model = onnx_session
+        return self
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.isHalf:
+            audio1 = self.model.run(
+                ["audio"],
+                {
+                    "feats": feats.cpu().numpy().astype(np.float16),
+                    "p_len": pitch_length.cpu().numpy().astype(np.int64),
+                    "sid": sid.cpu().numpy().astype(np.int64),
+                },
+            )
+        else:
+            audio1 = self.model.run(
+                ["audio"],
+                {
+                    "feats": feats.cpu().numpy().astype(np.float32),
+                    "p_len": pitch_length.cpu().numpy().astype(np.int64),
+                    "sid": sid.cpu().numpy().astype(np.int64),
+                },
+            )
+
+        return torch.tensor(np.array(audio1))
+
+    def setHalf(self, isHalf: bool):
+        raise RuntimeError("half-precision is not changeable.", self.isHalf)
+
+    def setDevice(self, dev: device):
+        self.dev = dev
+        # onnxruntime sessions have no .to(); the device is fixed by the
+        # execution provider chosen at session creation.
+
+    def to(self, dev: torch.device):
+        # no-op for ONNX: the session stays on the provider it was
+        # created with (see setDevice above).
+        return self
diff --git a/server/voice_changer/RVC/inferencer/RVCInferencer.py b/server/voice_changer/RVC/inferencer/RVCInferencer.py
new file mode 100644
index 00000000..7f6f5c9d
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/RVCInferencer.py
@@ -0,0 +1,33 @@
+import torch
+from torch import device
+
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from infer_pack.models import (  # type:ignore
+    SynthesizerTrnMs256NSFsid,
+)
+
+
+class RVCInferencer(Inferencer):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        super().setProps(EnumInferenceTypes.pyTorchRVC, file, dev, isHalf)
+        cpt = torch.load(file, map_location="cpu")
+        model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
+
+        model.eval()
+        model.load_state_dict(cpt["weight"], strict=False)
+        if isHalf:
+            model = model.half()
+
+        self.model = model
+        return self
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py
new file mode 100644
index 00000000..97a9c429
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py
@@ -0,0 +1,33 @@
+import torch
+from torch import device
+
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from infer_pack.models import (  # type:ignore
+    SynthesizerTrnMs256NSFsid_nono,
+)
+
+
+class RVCInferencerNono(Inferencer):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        super().setProps(EnumInferenceTypes.pyTorchRVCNono, file, dev, isHalf)
+        cpt = torch.load(file, map_location="cpu")
+        model = SynthesizerTrnMs256NSFsid_nono(*cpt["config"], is_half=isHalf)
+
+        model.eval()
+        model.load_state_dict(cpt["weight"], strict=False)
+        if isHalf:
+            model = model.half()
+
+        self.model = model
+        return self
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.model.infer(feats, pitch_length, sid)
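Note: all inferencers share the five-argument infer() signature, but the two families
treat pitch differently; a sketch with illustrative tensors:

    audio = f0_inferencer.infer(feats, p_len, pitch, pitchf, sid)  # f0 models need pitch/pitchf
    audio = nono_inferencer.infer(feats, p_len, None, None, sid)   # Nono models ignore them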
diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencer.py b/server/voice_changer/RVC/inferencer/WebUIInferencer.py
new file mode 100644
index 00000000..7945968d
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/WebUIInferencer.py
@@ -0,0 +1,31 @@
+import torch
+from torch import device
+
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from .models import SynthesizerTrnMsNSFsid
+
+
+class WebUIInferencer(Inferencer):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        super().setProps(EnumInferenceTypes.pyTorchWebUI, file, dev, isHalf)
+        cpt = torch.load(file, map_location="cpu")
+        model = SynthesizerTrnMsNSFsid(**cpt["params"], is_half=isHalf)
+
+        model.eval()
+        model.load_state_dict(cpt["weight"], strict=False)
+        if isHalf:
+            model = model.half()
+
+        self.model = model
+        return self
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
new file mode 100644
index 00000000..faa4c4c3
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
@@ -0,0 +1,31 @@
+import torch
+from torch import device
+
+from const import EnumInferenceTypes
+from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from .models import SynthesizerTrnMsNSFsidNono
+
+
+class WebUIInferencerNono(Inferencer):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        super().setProps(EnumInferenceTypes.pyTorchWebUINono, file, dev, isHalf)
+        cpt = torch.load(file, map_location="cpu")
+        model = SynthesizerTrnMsNSFsidNono(**cpt["params"], is_half=isHalf)
+
+        model.eval()
+        model.load_state_dict(cpt["weight"], strict=False)
+        if isHalf:
+            model = model.half()
+
+        self.model = model
+        return self
+
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch_length: torch.Tensor,
+        pitch: torch.Tensor | None,
+        pitchf: torch.Tensor | None,
+        sid: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.model.infer(feats, pitch_length, sid)
diff --git a/server/voice_changer/RVC/inferencer/models.py b/server/voice_changer/RVC/inferencer/models.py
new file mode 100644
index 00000000..4bd04651
--- /dev/null
+++ b/server/voice_changer/RVC/inferencer/models.py
@@ -0,0 +1,277 @@
+import math
+import torch
+from torch import nn
+
+from infer_pack.models import (  # type:ignore
+    GeneratorNSF,
+    PosteriorEncoder,
+    ResidualCouplingBlock,
+    Generator,
+)
+from infer_pack import commons, attentions  # type:ignore
+
+
+class TextEncoder(nn.Module):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        emb_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.emb_channels = emb_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(emb_channels, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0 is True:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, phone, pitch, lengths):
+        if pitch is None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask
+
+
+class SynthesizerTrnMsNSFsid(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        emb_channels,
+        sr,
+        **kwargs
+    ):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.emb_channels = emb_channels
+        # self.hop_length = hop_length
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            emb_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # ds is the speaker id, [bs, 1]
+        # print(1, pitch.shape)  # [bs, t]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        # print(-1, pitchf.shape, ids_slice, self.segment_size, self.hop_length, self.segment_size // self.hop_length)
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+        # print(-2, pitchf.shape, z_slice.shape)
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+class SynthesizerTrnMsNSFsidNono(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        emb_channels,
+        sr=None,
+        **kwargs
+    ):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.emb_channels = emb_channels
+        # self.hop_length = hop_length
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            emb_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            f0=False,
+        )
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        o = self.dec(z_slice, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, phone, phone_lengths, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
diff --git a/server/voice_changer/RVC/pipeline/PipelineGenerator.py b/server/voice_changer/RVC/pipeline/PipelineGenerator.py
new file mode 100644
index 00000000..81f9b80f
--- /dev/null
+++ b/server/voice_changer/RVC/pipeline/PipelineGenerator.py
@@ -0,0 +1,7 @@
+from voice_changer.RVC.ModelSlot import ModelSlot
+
+
+class PipelineGenerator:
+    @classmethod
+    def generatePipeline(cls, modelSlot: ModelSlot):
+        pass
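End-to-end sketch of how the new pieces compose (paths and parameters are
illustrative; EmbedderManager.getEmbedder is assumed to mirror
InferencerManager.getInferencer, since its exact signature is not shown in this
patch):

    import torch
    from const import EnumEmbedderTypes, EnumInferenceTypes
    from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
    from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
    from voice_changer.RVC.custom_vc_infer_pipeline import VC

    dev = torch.device("cpu")
    embedder = EmbedderManager.getEmbedder(
        EnumEmbedderTypes.hubert_jp, "hubert-base-japanese.pt", False, dev  # hypothetical checkpoint
    )
    inferencer = InferencerManager.getInferencer(
        EnumInferenceTypes.pyTorchRVC, "model.pth", False, dev  # hypothetical checkpoint
    )
    vc = VC(40000, dev, False, 0)  # modelSamplingRate, device, isHalf, repeat
    # audio_out = vc.pipeline(embedder, inferencer, sid, audio, f0_up_key, ...)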