WIP: Japanese Hubert

wataru 2023-05-02 12:11:00 +09:00
parent 4faee09aa1
commit 7e8e31a8c0
11 changed files with 247 additions and 66 deletions

View File

@@ -20,7 +20,6 @@ export const MergeLabRow = (_props: MergeLabRowProps) => {
         }, "")
     }, [appState.serverSetting.serverSetting.modelSlots])
-    console.log("newSlotChangeKey", newSlotChangeKey)

     useEffect(() => {
         // Filter to PyTorch models only
         const models = appState.serverSetting.serverSetting.modelSlots.filter(x => { return x.pyTorchModelFile && x.pyTorchModelFile.length > 0 })

View File

@@ -53,6 +53,9 @@ def setupArgParser():
     parser.add_argument(
         "--hubert_base", type=str, help="path to hubert_base model(pytorch)"
     )
+    parser.add_argument(
+        "--hubert_base_jp", type=str, help="path to hubert_base_jp model(pytorch)"
+    )
     parser.add_argument(
         "--hubert_soft", type=str, help="path to hubert_soft model(pytorch)"
     )
@@ -109,6 +112,7 @@ if __name__ == "MMVCServerSIO":
         content_vec_500_onnx=args.content_vec_500_onnx,
         content_vec_500_onnx_on=args.content_vec_500_onnx_on,
         hubert_base=args.hubert_base,
+        hubert_base_jp=args.hubert_base_jp,
         hubert_soft=args.hubert_soft,
         nsf_hifigan=args.nsf_hifigan,
     )
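
For orientation, a minimal sketch of how the new flag flows from the command line into VoiceChangerParams. It assumes setupArgParser() returns the configured ArgumentParser; the checkpoint paths and the other field values are hypothetical placeholders:

    # Hypothetical startup snippet: parse the new flag and forward it to the
    # VoiceChangerParams dataclass (whose hubert_base_jp field is added at the
    # end of this commit).
    parser = setupArgParser()
    args = parser.parse_args(
        ["--hubert_base", "hubert_base.pt", "--hubert_base_jp", "hubert_base_jp.pt"]
    )
    voiceChangerParams = VoiceChangerParams(
        content_vec_500="",
        content_vec_500_onnx="",
        content_vec_500_onnx_on=False,
        hubert_base=args.hubert_base,
        hubert_base_jp=args.hubert_base_jp,
        hubert_soft="",
        nsf_hifigan="",
    )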

View File

@@ -1,3 +1,4 @@
+from enum import Enum
 import os
 import sys
 import tempfile
@@ -63,3 +64,9 @@ def getFrontendPath():
         else "../client/demo/dist"
     )
     return frontend_path
+
+
+class EnumEmbedderTypes(Enum):
+    hubert = "hubert"
+    contentvec = "contentvec"
+    hubert_jp = "hubert_jp"
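
Since each member's value is the plain string a model slot stores, a stored embedder name can be mapped back to the enum by value lookup. A minimal sketch; the literal "hubert_jp" stands in for whatever the slot actually holds, and whether the server uses value lookup or passes members around directly is not shown in this commit:

    from const import EnumEmbedderTypes

    # Enum(value) returns the member whose value equals the given string.
    embedderType = EnumEmbedderTypes("hubert_jp")
    assert embedderType is EnumEmbedderTypes.hubert_jp
    assert embedderType.value == "hubert_jp"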

View File

@@ -7,6 +7,8 @@ from voice_changer.RVC.MergeModelRequest import MergeModelRequest
 from voice_changer.RVC.ModelWrapper import ModelWrapper
 from Exceptions import NoModeLoadedException
 from voice_changer.RVC.RVCSettings import RVCSettings
+from voice_changer.RVC.embedder.Embedder import Embedder
+from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
 from voice_changer.utils.VoiceChangerModel import AudioInOut
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
@@ -16,7 +18,7 @@ from typing import cast
 import numpy as np
 import torch
-from fairseq import checkpoint_utils
+# from fairseq import checkpoint_utils
 import traceback
 import faiss
@@ -56,6 +58,7 @@ providers = [
 class RVC:
     audio_buffer: AudioInOut | None = None
+    embedder: Embedder | None = None

     def __init__(self, params: VoiceChangerParams):
         self.initialLoad = True
@@ -119,21 +122,24 @@ class RVC:
             asdict(self.settings.modelSlots[tmp_slot]),
         )

         # load hubert
-        try:
-            hubert_path = self.params.hubert_base
-            models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-                [hubert_path],
-                suffix="",
-            )
-            model = models[0]
-            model.eval()
-            if self.is_half:
-                model = model.half()
-            self.hubert_model = model
-        except Exception as e:
-            print("EXCEPTION during loading hubert/contentvec model", e)
-            print(" hubert_path:", hubert_path)
+        # try:
+        #     hubert_path = self.params.hubert_base
+        #     hubert_path_jp = self.params.hubert_base_jp
+        #     print(hubert_path, hubert_path_jp)
+        #     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+        #         [hubert_path],
+        #         suffix="",
+        #     )
+        #     model = models[0]
+        #     model.eval()
+        #     if self.is_half:
+        #         model = model.half()
+        #     self.hubert_model = model
+        # except Exception as e:
+        #     print("EXCEPTION during loading hubert/contentvec model", e)
+        #     print(" hubert_path:", hubert_path)

         # load only on first run
         if self.initialLoad or tmp_slot == self.currentSlot:
@@ -256,6 +262,7 @@ class RVC:
         self.next_trans = self.settings.modelSlots[slot].defaultTrans
         self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
+        self.next_embedder = self.settings.modelSlots[slot].embedder
         self.next_framework = (
             "ONNX" if self.next_onnx_session is not None else "PyTorch"
         )
@@ -266,6 +273,17 @@ class RVC:
         print("[Voice Changer] Switching model..")
         # del self.net_g
         # del self.onnx_session
+        try:
+            self.embedder = EmbedderManager.getEmbedder(
+                self.next_embedder,
+                self.params.hubert_base,
+                True,
+                torch.device("cuda:0"),
+            )
+        except Exception as e:
+            print("[Voice Changer] load hubert error", e)
+            traceback.print_exc()
+
         self.net_g = self.next_net_g
         self.onnx_session = self.next_onnx_session
         self.feature_file = self.next_feature_file
@@ -397,7 +415,8 @@ class RVC:
         else:
             dev = torch.device("cuda", index=self.settings.gpu)
-            self.hubert_model = self.hubert_model.to(dev)
+            # self.hubert_model = self.hubert_model.to(dev)
+            self.embedder = self.embedder.to(dev)

         audio = data[0]
         convertSize = data[1]
@@ -420,7 +439,8 @@ class RVC:
         embChannels = self.settings.modelSlots[self.currentSlot].embChannels
         audio_out = vc.pipeline(
-            self.hubert_model,
+            # self.hubert_model,
+            self.embedder,
             self.onnx_session,
             sid,
             audio,
@@ -454,7 +474,7 @@ class RVC:
         else:
             dev = torch.device("cuda", index=self.settings.gpu)
-            self.hubert_model = self.hubert_model.to(dev)
+            self.embedder = self.embedder.to(dev)
             self.net_g = self.net_g.to(dev)

         audio = data[0]
@@ -478,7 +498,8 @@ class RVC:
         embChannels = self.settings.modelSlots[self.currentSlot].embChannels
         audio_out = vc.pipeline(
-            self.hubert_model,
+            # self.hubert_model,
+            self.embedder,
             self.net_g,
             sid,
             audio,
@@ -620,9 +641,7 @@ class RVC:
             indexFilename=None,
             clusterTorchModelFilename=None,
         )
-        params = {
-            "trans":req.defaultTrans
-        }
+        params = {"trans": req.defaultTrans}
         props: LoadModelParams = LoadModelParams(
             slot=targetSlot, isHalf=True, files=filePaths, params=json.dumps(params)
         )

View File

@@ -6,6 +6,8 @@ import torch.nn.functional as F
 import scipy.signal as signal
 import pyworld

+from voice_changer.RVC.embedder.Embedder import Embedder
+

 class VC(object):
     def __init__(self, tgt_sr, device, is_half, x_pad):
@@ -66,35 +68,11 @@ class VC(object):
         f0_mel[f0_mel > 255] = 255
         f0_coarse = np.rint(f0_mel).astype(np.int)

-        # Volume Extract
-        # volume = self.extractVolume(audio, 512)
-        # volume = np.pad(
-        #     volume.astype("float"), (start_frame, n_frames - len(volume) - start_frame)
-        # )
-        # return f0_coarse, f0bak, volume  # 1-0

         return f0_coarse, f0bak

-    # def extractVolume(self, audio, hopsize):
-    #     n_frames = int(len(audio) // hopsize) + 1
-    #     audio2 = audio**2
-    #     audio2 = np.pad(
-    #         audio2,
-    #         (int(hopsize // 2), int((hopsize + 1) // 2)),
-    #         mode="reflect",
-    #     )
-    #     volume = np.array(
-    #         [
-    #             np.mean(audio2[int(n * hopsize) : int((n + 1) * hopsize)])  # noqa:E203
-    #             for n in range(n_frames)
-    #         ]
-    #     )
-    #     volume = np.sqrt(volume)
-    #     return volume

     def pipeline(
         self,
-        embedder,
+        embedder: Embedder,
         model,
         sid,
         audio,
@@ -141,24 +119,25 @@
         # embedding
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        if embChannels == 256:
-            inputs = {
-                "source": feats.to(self.device),
-                "padding_mask": padding_mask,
-                "output_layer": 9,  # layer 9
-            }
-        else:
-            inputs = {
-                "source": feats.to(self.device),
-                "padding_mask": padding_mask,
-            }
-        with torch.no_grad():
-            logits = embedder.extract_features(**inputs)
-            if embChannels == 256:
-                feats = embedder.final_proj(logits[0])
-            else:
-                feats = logits[0]
+        feats = embedder.extractFeatures(feats, embChannels)
+        # if embChannels == 256:
+        #     inputs = {
+        #         "source": feats.to(self.device),
+        #         "padding_mask": padding_mask,
+        #         "output_layer": 9,  # layer 9
+        #     }
+        # else:
+        #     inputs = {
+        #         "source": feats.to(self.device),
+        #         "padding_mask": padding_mask,
+        #     }
+        # with torch.no_grad():
+        #     logits = embedder.extract_features(**inputs)
+        #     if embChannels == 256:
+        #         feats = embedder.final_proj(logits[0])
+        #     else:
+        #         feats = logits[0]

         # Index - feature extraction
         if (

View File

@@ -0,0 +1,61 @@
+from typing import Any, Protocol
+
+import torch
+from torch import device
+
+from const import EnumEmbedderTypes
+
+
+class Embedder(Protocol):
+    embedderType: EnumEmbedderTypes = EnumEmbedderTypes.hubert
+    file: str
+    isHalf: bool = True
+    dev: device
+
+    model: Any | None = None
+
+    def loadModel(self, file: str, dev: device, isHalf: bool = True):
+        self.embedderType = EnumEmbedderTypes.hubert
+        self.file = file
+        self.isHalf = isHalf
+        self.dev = dev
+
+    def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
+        ...
+
+    def setHalf(self, isHalf: bool):
+        self.isHalf = isHalf
+        if self.model is not None and isHalf:
+            self.model = self.model.half()
+
+    def setDevice(self, dev: device):
+        self.dev = dev
+        if self.model is not None:
+            self.model = self.model.to(self.dev)
+
+    def matchCondition(self, embedderType: EnumEmbedderTypes, file: str) -> bool:
+        # Check Type
+        if self.embedderType != embedderType:
+            print(
+                "[Voice Changer] embedder type does not match",
+                self.embedderType,
+                embedderType,
+            )
+            return False
+        # Check File Path
+        if self.file != file:
+            print(
+                "[Voice Changer] embedder file does not match",
+                self.file,
+                file,
+            )
+            return False
+        else:
+            return True
+
+    def to(self, dev: torch.device):
+        if self.model is not None:
+            self.model = self.model.to(dev)
+        return self
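
To illustrate the contract this Protocol defines, here is a hypothetical minimal embedder that conforms to it without loading any weights. PassthroughEmbedder is not part of the commit; it only shows that explicit subclasses inherit the default method bodies, which is what FairseqHubert below relies on:

    import torch
    from torch import device
    from voice_changer.RVC.embedder.Embedder import Embedder

    class PassthroughEmbedder(Embedder):
        # Hypothetical: records the load parameters via the Protocol's
        # default loadModel and returns the input features unchanged.
        def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
            super().loadModel(file, dev, isHalf)
            return self

        def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
            return feats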

View File

@@ -0,0 +1,42 @@
+from torch import device
+
+from const import EnumEmbedderTypes
+from voice_changer.RVC.embedder.Embedder import Embedder
+from voice_changer.RVC.embedder.FairseqContentvec import FairseqContentvec
+from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
+from voice_changer.RVC.embedder.FairseqHubertJp import FairseqHubertJp
+
+
+class EmbedderManager:
+    currentEmbedder: Embedder | None = None
+
+    @classmethod
+    def getEmbedder(
+        cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device
+    ) -> Embedder:
+        if cls.currentEmbedder is None:
+            print("[Voice Changer] generate new embedder. (no embedder)")
+            cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
+        elif cls.currentEmbedder.matchCondition(embederType, file) is False:
+            print("[Voice Changer] generate new embedder. (not match)")
+            cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
+        else:
+            cls.currentEmbedder.setDevice(dev)
+            cls.currentEmbedder.setHalf(isHalf)
+        print("RETURN", cls.currentEmbedder)
+        return cls.currentEmbedder
+
+    @classmethod
+    def loadEmbedder(
+        cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device
+    ) -> Embedder:
+        if embederType == EnumEmbedderTypes.hubert:
+            return FairseqHubert().loadModel(file, dev, isHalf)
+        elif embederType == EnumEmbedderTypes.hubert_jp:  # same as hubert
+            return FairseqHubertJp().loadModel(file, dev, isHalf)
+        elif embederType == EnumEmbedderTypes.contentvec:  # same as hubert
+            return FairseqContentvec().loadModel(file, dev, isHalf)
+        else:
+            # return hubert as default
+            return FairseqHubert().loadModel(file, dev, isHalf)
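
A short usage sketch of the manager's caching behavior; the checkpoint paths and the CPU device are hypothetical:

    import torch
    from const import EnumEmbedderTypes
    from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager

    dev = torch.device("cpu")  # hypothetical device
    # First call: no cached embedder yet, so one is loaded.
    emb1 = EmbedderManager.getEmbedder(EnumEmbedderTypes.hubert, "hubert_base.pt", True, dev)
    # Same type and file: matchCondition passes, so the cached instance is
    # reused with only device and half-precision refreshed.
    emb2 = EmbedderManager.getEmbedder(EnumEmbedderTypes.hubert, "hubert_base.pt", True, dev)
    assert emb1 is emb2
    # Different type and file: matchCondition fails and a new embedder replaces it.
    emb3 = EmbedderManager.getEmbedder(EnumEmbedderTypes.hubert_jp, "hubert_base_jp.pt", True, dev)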

View File

@@ -0,0 +1,11 @@
+from torch import device
+from const import EnumEmbedderTypes
+from voice_changer.RVC.embedder.Embedder import Embedder
+from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
+
+
+class FairseqContentvec(FairseqHubert):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
+        super().loadModel(file, dev, isHalf)
+        self.embedderType = EnumEmbedderTypes.contentvec
+        return self

View File

@@ -0,0 +1,47 @@
+import torch
+from torch import device
+
+from const import EnumEmbedderTypes
+from voice_changer.RVC.embedder.Embedder import Embedder
+from fairseq import checkpoint_utils
+
+
+class FairseqHubert(Embedder):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
+        super().loadModel(file, dev, isHalf)
+
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [file],
+            suffix="",
+        )
+        model = models[0]
+        model.eval()
+        model = model.to(dev)
+        if isHalf:
+            model = model.half()
+
+        self.model = model
+        self.embedderType = EnumEmbedderTypes.hubert
+        return self
+
+    def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
+        padding_mask = torch.BoolTensor(feats.shape).to(self.dev).fill_(False)
+        if embChannels == 256:
+            inputs = {
+                "source": feats.to(self.dev),
+                "padding_mask": padding_mask,
+                "output_layer": 9,  # layer 9
+            }
+        else:
+            inputs = {
+                "source": feats.to(self.dev),
+                "padding_mask": padding_mask,
+            }
+
+        with torch.no_grad():
+            logits = self.model.extract_features(**inputs)
+            if embChannels == 256:
+                feats = self.model.final_proj(logits[0])
+            else:
+                feats = logits[0]
+        return feats
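
A sketch of the expected call pattern, assuming HuBERT consumes 16 kHz mono audio shaped (batch, samples) and that the checkpoint path exists; isHalf=False keeps the example runnable on CPU:

    import torch
    from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert

    embedder = FairseqHubert().loadModel("hubert_base.pt", torch.device("cpu"), isHalf=False)
    audio = torch.zeros(1, 16000)  # one second of silence, batch of 1
    feats = embedder.extractFeatures(audio, embChannels=256)
    # With embChannels == 256, the layer-9 output is projected through
    # final_proj, so the last dimension of feats is 256.
    print(feats.shape)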

View File

@@ -0,0 +1,11 @@
+from torch import device
+from const import EnumEmbedderTypes
+from voice_changer.RVC.embedder.Embedder import Embedder
+from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
+
+
+class FairseqHubertJp(FairseqHubert):
+    def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
+        super().loadModel(file, dev, isHalf)
+        self.embedderType = EnumEmbedderTypes.hubert_jp
+        return self

View File

@@ -2,10 +2,11 @@ from dataclasses import dataclass

 @dataclass
-class VoiceChangerParams():
+class VoiceChangerParams:
     content_vec_500: str
     content_vec_500_onnx: str
     content_vec_500_onnx_on: bool
     hubert_base: str
+    hubert_base_jp: str
     hubert_soft: str
     nsf_hifigan: str