diff --git a/client/demo/src/components/demo/components/a01_MergeLab.Row.tsx b/client/demo/src/components/demo/components/a01_MergeLab.Row.tsx index b2063450..4030a226 100644 --- a/client/demo/src/components/demo/components/a01_MergeLab.Row.tsx +++ b/client/demo/src/components/demo/components/a01_MergeLab.Row.tsx @@ -20,7 +20,6 @@ export const MergeLabRow = (_props: MergeLabRowProps) => { }, "") }, [appState.serverSetting.serverSetting.modelSlots]) - console.log("newSlotChangeKey", newSlotChangeKey) useEffect(() => { // PyTorchモデルだけフィルタリング const models = appState.serverSetting.serverSetting.modelSlots.filter(x => { return x.pyTorchModelFile && x.pyTorchModelFile.length > 0 }) diff --git a/server/MMVCServerSIO.py b/server/MMVCServerSIO.py index 808a3bcc..032efaa7 100755 --- a/server/MMVCServerSIO.py +++ b/server/MMVCServerSIO.py @@ -53,6 +53,9 @@ def setupArgParser(): parser.add_argument( "--hubert_base", type=str, help="path to hubert_base model(pytorch)" ) + parser.add_argument( + "--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)" + ) parser.add_argument( "--hubert_soft", type=str, help="path to hubert_soft model(pytorch)" ) @@ -109,6 +112,7 @@ if __name__ == "MMVCServerSIO": content_vec_500_onnx=args.content_vec_500_onnx, content_vec_500_onnx_on=args.content_vec_500_onnx_on, hubert_base=args.hubert_base, + hubert_base_jp=args.hubert_base_jp, hubert_soft=args.hubert_soft, nsf_hifigan=args.nsf_hifigan, ) diff --git a/server/const.py b/server/const.py index 7f8e406c..1f7cbfe5 100644 --- a/server/const.py +++ b/server/const.py @@ -1,3 +1,4 @@ +from enum import Enum import os import sys import tempfile @@ -63,3 +64,9 @@ def getFrontendPath(): else "../client/demo/dist" ) return frontend_path + + +class EnumEmbedderTypes(Enum): + hubert = "hubert" + contentvec = "contentvec" + hubert_jp = "hubert_jp" diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index bc5a18a6..7bde7b7a 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -7,6 +7,8 @@ from voice_changer.RVC.MergeModelRequest import MergeModelRequest from voice_changer.RVC.ModelWrapper import ModelWrapper from Exceptions import NoModeLoadedException from voice_changer.RVC.RVCSettings import RVCSettings +from voice_changer.RVC.embedder.Embedder import Embedder +from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams from voice_changer.utils.VoiceChangerModel import AudioInOut from voice_changer.utils.VoiceChangerParams import VoiceChangerParams @@ -16,7 +18,7 @@ from typing import cast import numpy as np import torch -from fairseq import checkpoint_utils +# from fairseq import checkpoint_utils import traceback import faiss @@ -56,6 +58,7 @@ providers = [ class RVC: audio_buffer: AudioInOut | None = None + embedder: Embedder | None = None def __init__(self, params: VoiceChangerParams): self.initialLoad = True @@ -119,21 +122,24 @@ class RVC: asdict(self.settings.modelSlots[tmp_slot]), ) # hubertロード - try: - hubert_path = self.params.hubert_base - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( - [hubert_path], - suffix="", - ) - model = models[0] - model.eval() - if self.is_half: - model = model.half() - self.hubert_model = model + # try: + # hubert_path = self.params.hubert_base + # hubert_path_jp = self.params.hubert_base_jp + # print(hubert_path, hubert_path_jp) - except Exception as e: - print("EXCEPTION during loading hubert/contentvec model", e) - print(" hubert_path:", hubert_path) + # models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + # [hubert_path], + # suffix="", + # ) + # model = models[0] + # model.eval() + # if self.is_half: + # model = model.half() + # self.hubert_model = model + + # except Exception as e: + # print("EXCEPTION during loading hubert/contentvec model", e) + # print(" hubert_path:", hubert_path) # 初回のみロード if self.initialLoad or tmp_slot == self.currentSlot: @@ -256,6 +262,7 @@ class RVC: self.next_trans = self.settings.modelSlots[slot].defaultTrans self.next_samplingRate = self.settings.modelSlots[slot].samplingRate + self.next_embedder = self.settings.modelSlots[slot].embedder self.next_framework = ( "ONNX" if self.next_onnx_session is not None else "PyTorch" ) @@ -266,6 +273,17 @@ class RVC: print("[Voice Changer] Switching model..") # del self.net_g # del self.onnx_session + try: + self.embedder = EmbedderManager.getEmbedder( + self.next_embedder, + self.params.hubert_base, + True, + torch.device("cuda:0"), + ) + except Exception as e: + print("[Voice Changer] load hubert error", e) + traceback.print_exc() + self.net_g = self.next_net_g self.onnx_session = self.next_onnx_session self.feature_file = self.next_feature_file @@ -397,7 +415,8 @@ class RVC: else: dev = torch.device("cuda", index=self.settings.gpu) - self.hubert_model = self.hubert_model.to(dev) + # self.hubert_model = self.hubert_model.to(dev) + self.embedder = self.embedder.to(dev) audio = data[0] convertSize = data[1] @@ -420,7 +439,8 @@ class RVC: embChannels = self.settings.modelSlots[self.currentSlot].embChannels audio_out = vc.pipeline( - self.hubert_model, + # self.hubert_model, + self.embedder, self.onnx_session, sid, audio, @@ -454,7 +474,7 @@ class RVC: else: dev = torch.device("cuda", index=self.settings.gpu) - self.hubert_model = self.hubert_model.to(dev) + self.embedder = self.embedder.to(dev) self.net_g = self.net_g.to(dev) audio = data[0] @@ -478,7 +498,8 @@ class RVC: embChannels = self.settings.modelSlots[self.currentSlot].embChannels audio_out = vc.pipeline( - self.hubert_model, + # self.hubert_model, + self.embedder, self.net_g, sid, audio, @@ -620,9 +641,7 @@ class RVC: indexFilename=None, clusterTorchModelFilename=None, ) - params = { - "trans":req.defaultTrans - } + params = {"trans": req.defaultTrans} props: LoadModelParams = LoadModelParams( slot=targetSlot, isHalf=True, files=filePaths, params=json.dumps(params) ) diff --git a/server/voice_changer/RVC/custom_vc_infer_pipeline.py b/server/voice_changer/RVC/custom_vc_infer_pipeline.py index 0b631040..8f8c0ac7 100644 --- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py +++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py @@ -6,6 +6,8 @@ import torch.nn.functional as F import scipy.signal as signal import pyworld +from voice_changer.RVC.embedder.Embedder import Embedder + class VC(object): def __init__(self, tgt_sr, device, is_half, x_pad): @@ -66,35 +68,11 @@ class VC(object): f0_mel[f0_mel > 255] = 255 f0_coarse = np.rint(f0_mel).astype(np.int) - # Volume Extract - # volume = self.extractVolume(audio, 512) - # volume = np.pad( - # volume.astype("float"), (start_frame, n_frames - len(volume) - start_frame) - # ) - - # return f0_coarse, f0bak, volume # 1-0 return f0_coarse, f0bak - # def extractVolume(self, audio, hopsize): - # n_frames = int(len(audio) // hopsize) + 1 - # audio2 = audio**2 - # audio2 = np.pad( - # audio2, - # (int(hopsize // 2), int((hopsize + 1) // 2)), - # mode="reflect", - # ) - # volume = np.array( - # [ - # np.mean(audio2[int(n * hopsize) : int((n + 1) * hopsize)]) # noqa:E203 - # for n in range(n_frames) - # ] - # ) - # volume = np.sqrt(volume) - # return volume - def pipeline( self, - embedder, + embedder: Embedder, model, sid, audio, @@ -141,24 +119,25 @@ class VC(object): # embedding padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - if embChannels == 256: - inputs = { - "source": feats.to(self.device), - "padding_mask": padding_mask, - "output_layer": 9, # layer 9 - } - else: - inputs = { - "source": feats.to(self.device), - "padding_mask": padding_mask, - } + feats = embedder.extractFeatures(feats, embChannels) + # if embChannels == 256: + # inputs = { + # "source": feats.to(self.device), + # "padding_mask": padding_mask, + # "output_layer": 9, # layer 9 + # } + # else: + # inputs = { + # "source": feats.to(self.device), + # "padding_mask": padding_mask, + # } - with torch.no_grad(): - logits = embedder.extract_features(**inputs) - if embChannels == 256: - feats = embedder.final_proj(logits[0]) - else: - feats = logits[0] + # with torch.no_grad(): + # logits = embedder.extract_features(**inputs) + # if embChannels == 256: + # feats = embedder.final_proj(logits[0]) + # else: + # feats = logits[0] # Index - feature抽出 if ( diff --git a/server/voice_changer/RVC/embedder/Embedder.py b/server/voice_changer/RVC/embedder/Embedder.py new file mode 100644 index 00000000..894e939e --- /dev/null +++ b/server/voice_changer/RVC/embedder/Embedder.py @@ -0,0 +1,61 @@ +from typing import Any, Protocol + +import torch +from torch import device + +from const import EnumEmbedderTypes + + +class Embedder(Protocol): + embedderType: EnumEmbedderTypes = EnumEmbedderTypes.hubert + file: str + isHalf: bool = True + dev: device + + model: Any | None = None + + def loadModel(self, file: str, dev: device, isHalf: bool = True): + self.embedderType = EnumEmbedderTypes.hubert + self.file = file + self.isHalf = isHalf + self.dev = dev + + def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor: + ... + + def setHalf(self, isHalf: bool): + self.isHalf = isHalf + if self.model is not None and isHalf: + self.model = self.model.half() + + def setDevice(self, dev: device): + self.dev = dev + if self.model is not None: + self.model = self.model.to(self.dev) + + def matchCondition(self, embedderType: EnumEmbedderTypes, file: str) -> bool: + # Check Type + if self.embedderType != embedderType: + print( + "[Voice Changer] embeder type is not match", + self.embedderType, + embedderType, + ) + return False + + # Check File Path + if self.file != file: + print( + "[Voice Changer] embeder file is not match", + self.file, + file, + ) + return False + + else: + return True + + def to(self, dev: torch.device): + if self.model is not None: + self.model = self.model.to(dev) + return self diff --git a/server/voice_changer/RVC/embedder/EmbedderManager.py b/server/voice_changer/RVC/embedder/EmbedderManager.py new file mode 100644 index 00000000..be3b8ae0 --- /dev/null +++ b/server/voice_changer/RVC/embedder/EmbedderManager.py @@ -0,0 +1,42 @@ +from torch import device + +from const import EnumEmbedderTypes +from voice_changer.RVC.embedder.Embedder import Embedder +from voice_changer.RVC.embedder.FairseqContentvec import FairseqContentvec +from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert +from voice_changer.RVC.embedder.FairseqHubertJp import FairseqHubertJp + + +class EmbedderManager: + currentEmbedder: Embedder | None = None + + @classmethod + def getEmbedder( + cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device + ) -> Embedder: + if cls.currentEmbedder is None: + print("[Voice Changer] generate new embedder. (no embedder)") + cls.loadEmbedder(embederType, file, isHalf, dev) + cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev) + elif cls.currentEmbedder.matchCondition(embederType, file) is False: + print("[Voice Changer] generate new embedder. (not match)") + cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev) + else: + cls.currentEmbedder.setDevice(dev) + cls.currentEmbedder.setHalf(isHalf) + print("RETURN", cls.currentEmbedder) + return cls.currentEmbedder + + @classmethod + def loadEmbedder( + cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device + ) -> Embedder: + if embederType == EnumEmbedderTypes.hubert: + return FairseqHubert().loadModel(file, dev, isHalf) + elif embederType == EnumEmbedderTypes.hubert_jp: # same as hubert + return FairseqHubertJp().loadModel(file, dev, isHalf) + elif embederType == EnumEmbedderTypes.contentvec: # same as hubert + return FairseqContentvec().loadModel(file, dev, isHalf) + else: + # return hubert as default + return FairseqHubert().loadModel(file, dev, isHalf) diff --git a/server/voice_changer/RVC/embedder/FairseqContentvec.py b/server/voice_changer/RVC/embedder/FairseqContentvec.py new file mode 100644 index 00000000..4463575e --- /dev/null +++ b/server/voice_changer/RVC/embedder/FairseqContentvec.py @@ -0,0 +1,11 @@ +from torch import device +from const import EnumEmbedderTypes +from voice_changer.RVC.embedder.Embedder import Embedder +from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert + + +class FairseqContentvec(FairseqHubert): + def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder: + super().loadModel(file, dev, isHalf) + self.embedderType = EnumEmbedderTypes.contentvec + return self diff --git a/server/voice_changer/RVC/embedder/FairseqHubert.py b/server/voice_changer/RVC/embedder/FairseqHubert.py new file mode 100644 index 00000000..edd9ac23 --- /dev/null +++ b/server/voice_changer/RVC/embedder/FairseqHubert.py @@ -0,0 +1,47 @@ +import torch +from torch import device +from const import EnumEmbedderTypes +from voice_changer.RVC.embedder.Embedder import Embedder +from fairseq import checkpoint_utils + + +class FairseqHubert(Embedder): + def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder: + super().loadModel(file, dev, isHalf) + + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [file], + suffix="", + ) + model = models[0] + model.eval() + + model = model.to(dev) + if isHalf: + model = model.half() + + self.model = model + self.embedderType = EnumEmbedderTypes.hubert + return self + + def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor: + padding_mask = torch.BoolTensor(feats.shape).to(self.dev).fill_(False) + if embChannels == 256: + inputs = { + "source": feats.to(self.dev), + "padding_mask": padding_mask, + "output_layer": 9, # layer 9 + } + else: + inputs = { + "source": feats.to(self.dev), + "padding_mask": padding_mask, + } + + with torch.no_grad(): + logits = self.model.extract_features(**inputs) + if embChannels == 256: + feats = self.model.final_proj(logits[0]) + else: + feats = logits[0] + return feats diff --git a/server/voice_changer/RVC/embedder/FairseqHubertJp.py b/server/voice_changer/RVC/embedder/FairseqHubertJp.py new file mode 100644 index 00000000..e3d1df13 --- /dev/null +++ b/server/voice_changer/RVC/embedder/FairseqHubertJp.py @@ -0,0 +1,11 @@ +from torch import device +from const import EnumEmbedderTypes +from voice_changer.RVC.embedder.Embedder import Embedder +from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert + + +class FairseqHubertJp(FairseqHubert): + def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder: + super().loadModel(file, dev, isHalf) + self.embedderType = EnumEmbedderTypes.hubert_jp + return self diff --git a/server/voice_changer/utils/VoiceChangerParams.py b/server/voice_changer/utils/VoiceChangerParams.py index c755ba1d..87b1c112 100644 --- a/server/voice_changer/utils/VoiceChangerParams.py +++ b/server/voice_changer/utils/VoiceChangerParams.py @@ -2,10 +2,11 @@ from dataclasses import dataclass @dataclass -class VoiceChangerParams(): +class VoiceChangerParams: content_vec_500: str content_vec_500_onnx: str content_vec_500_onnx_on: bool hubert_base: str + hubert_base_jp: str hubert_soft: str nsf_hifigan: str